"""
Download administrative boundary data from Alibaba Cloud DataV, save the raw
JSON, then clean it and save it as GeoJSON.

- Official site: https://datav.aliyun.com/portal/school/atlas/area_selector
- Step 1: resolve a place name to its administrative division code (adcode);
- Step 2: save the raw DataV payload as `<city_name>.json`;
- Step 3: drop the property fields that are incompatible with GDAL/GeoPandas
  (parent, center, centroid, acroutes);
- Step 4: write the cleaned result as `<city_name>.geojson`.

-------------------------------------------------------------------------------
Authors: Hong Xie
Last Updated: 2025-10-11
===============================================================================
"""

import json
import re
from pathlib import Path
from typing import Iterable, Optional, Tuple

import requests

# Path-style DataV endpoint; `code` is e.g. "420100" or "420100_full".
_DATAV_BOUND_URL = "https://geo.datav.aliyun.com/areas_v3/bound/{code}.json"

# Administrative suffixes removed before names are compared.
_NAME_SUFFIXES = ("市", "省", "地区", "盟", "自治州", "自治县", "特别行政区")

# Property keys that are always dropped from the cleaned GeoJSON.
_DROPPED_PROPERTIES = frozenset(("parent", "center", "centroid", "acroutes"))

# Only six-digit codes are accepted as adcodes.
_ADCODE_RE = re.compile(r"\d{6}")

# Errors that mean "this DataV document could not be fetched or parsed".
_FETCH_ERRORS = (requests.RequestException, ValueError)


def get_datav_json(accode: str, timeout: float = 15) -> dict:
    """
    Fetch the raw boundary JSON for one code from the DataV endpoint.

    Args:
        accode: Code placed in the URL path, e.g. "420100" or "420100_full".
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
        ValueError: If the body is not valid JSON.
    """
    url = _DATAV_BOUND_URL.format(code=accode)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def _sanitize_properties(props: Optional[dict]) -> dict:
    """
    Return a copy of *props* holding only scalar values.

    The keys in ``_DROPPED_PROPERTIES`` are always removed, and so is any
    value that is not ``str``/``int``/``float``/``bool``/``None``. A missing
    or non-dict *props* (a feature may carry ``"properties": null``) yields
    an empty dict instead of raising.
    """
    if not isinstance(props, dict):
        return {}
    return {
        key: value
        for key, value in props.items()
        if key not in _DROPPED_PROPERTIES
        and (value is None or isinstance(value, (str, int, float, bool)))
    }


def fetch_and_save_geojson(accode: str, city_name: str, out_dir: str) -> Path:
    """
    Download the DataV data for *accode* and write two files into *out_dir*.

    ``<city_name>.json`` receives the payload exactly as downloaded;
    ``<city_name>.geojson`` receives the same payload with every feature's
    properties passed through ``_sanitize_properties``. *out_dir* is created
    if it does not exist.

    Args:
        accode: Code to download, e.g. "420100" or "420100_full".
        city_name: Base name (without extension) of the two output files.
        out_dir: Output directory.

    Returns:
        Path of the written ``.geojson`` file.

    Raises:
        requests.RequestException, ValueError: Propagated from
            ``get_datav_json``.
    """
    raw_data = get_datav_json(accode)

    out_dir_path = Path(out_dir)
    out_dir_path.mkdir(parents=True, exist_ok=True)

    # Save the untouched payload first.
    raw_json_path = out_dir_path / f"{city_name}.json"
    with raw_json_path.open("w", encoding="utf-8") as f:
        json.dump(raw_data, f, ensure_ascii=False)

    # Build the cleaned document from shallow copies so raw_data itself is
    # never modified; only the "properties" member of each feature changes.
    cleaned = dict(raw_data)
    features = raw_data.get("features")
    if isinstance(features, list):
        cleaned["features"] = [
            {**feature, "properties": _sanitize_properties(feature.get("properties"))}
            for feature in features
        ]

    # ensure_ascii=False keeps Chinese characters readable in the UTF-8 file.
    out_path = out_dir_path / f"{city_name}.geojson"
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(cleaned, f, ensure_ascii=False)

    return out_path


def _normalize_name(name: str) -> str:
    """
    Strip surrounding whitespace and common administrative suffixes.

    A suffix is only removed when something remains afterwards: a name that
    consists solely of a suffix (e.g. "市") is returned unchanged, because an
    empty normalized name would be "contained" in every candidate name.
    """
    name = name.strip()
    for suffix in _NAME_SUFFIXES:
        if name.endswith(suffix) and len(name) > len(suffix):
            name = name[: -len(suffix)]
    return name


def _admin_suffix(name: str) -> str:
    """Return the recognized administrative suffix *name* ends with, or ""."""
    name = name.strip()
    for suffix in _NAME_SUFFIXES:
        if name.endswith(suffix) and len(name) > len(suffix):
            return suffix
    return ""


def _name_matches_exact(target: str, candidate: str) -> bool:
    """
    Tell whether *target* and *candidate* name the same place.

    The normalized names must be equal. In addition, when both names carry an
    explicit but different suffix they are treated as different places, so
    "吉林市" does not match the province "吉林省" even though both normalize
    to "吉林". A name without a suffix ("吉林") still matches either.
    """
    if _normalize_name(target) != _normalize_name(candidate):
        return False
    target_suffix = _admin_suffix(target)
    candidate_suffix = _admin_suffix(candidate)
    return not (target_suffix and candidate_suffix and target_suffix != candidate_suffix)


def _scan_features(
    features: Iterable[dict], target: str, levels: Tuple[str, ...]
) -> Tuple[Optional[str], Optional[str]]:
    """
    Search *features* for *target*, looking only at the given levels.

    Features whose ``level`` property is not in *levels*, or whose ``adcode``
    is not six digits, are ignored.

    Returns:
        ``(exact_code, contains_code)``. ``exact_code`` is the adcode of the
        first exact name match, or ``None``. ``contains_code`` is the adcode
        of the last feature seen (before any exact match) whose normalized
        name contains the normalized target, or ``None``.
    """
    wanted = _normalize_name(target)
    contains_code = None
    for feat in features:
        props = feat.get("properties") or {}
        if props.get("level") not in levels:
            continue
        code = str(props.get("adcode", ""))
        if not _ADCODE_RE.fullmatch(code):
            continue
        name = str(props.get("name") or "")
        if _name_matches_exact(target, name):
            return code, contains_code
        if wanted in _normalize_name(name):
            contains_code = code
    return None, contains_code


def resolve_adcode_by_name(city_name: str, prefer_full: bool = False) -> Optional[str]:
    """
    Resolve a place name to a DataV administrative division code.

    Lookup order:
      1. exact match against province-level names in ``100000_full``;
      2. exact match against city/district names in each ``<province>_full``
         document, in the order the provinces are listed nationally;
      3. the containment match from the first province that produced one;
      4. a containment match against province-level names.

    An exact match anywhere wins over every containment match, so when only
    a containment match exists all provinces are downloaded before it is
    returned. Provinces whose document cannot be fetched are skipped.

    Args:
        city_name: Name to look up, with or without an administrative suffix.
        prefer_full: If true, ``"_full"`` is appended to the returned code.
            NOTE(review): DataV may not publish a ``_full`` document for
            district-level codes - confirm before relying on it there.

    Returns:
        A code such as "420100" or "420100_full", or ``None`` when the name
        is empty, the national document cannot be fetched, or nothing matches.
    """

    def finalize(code: str) -> str:
        return f"{code}_full" if prefer_full else code

    # An empty name would be "contained" in every candidate.
    if not _normalize_name(city_name):
        return None

    try:
        cn = get_datav_json("100000_full", timeout=20)
    except _FETCH_ERRORS:
        # Without the national document there are no provinces to search.
        return None
    national_features = cn.get("features") or []

    exact, province_fallback = _scan_features(national_features, city_name, ("province",))
    if exact:
        return finalize(exact)

    # Province codes drive the deeper city/district search.
    province_codes = []
    for feat in national_features:
        props = feat.get("properties") or {}
        code = str(props.get("adcode", ""))
        if props.get("level") == "province" and _ADCODE_RE.fullmatch(code):
            province_codes.append(code)

    city_fallback = None
    for pcode in province_codes:
        try:
            prov = get_datav_json(f"{pcode}_full", timeout=20)
        except _FETCH_ERRORS:
            continue  # best effort: one unavailable province must not abort the search
        exact, contains = _scan_features(
            prov.get("features") or [], city_name, ("city", "district")
        )
        if exact:
            return finalize(exact)
        # Keep the earliest containment match, but go on looking for an
        # exact match in the remaining provinces.
        if city_fallback is None:
            city_fallback = contains

    fallback = city_fallback or province_fallback
    return finalize(fallback) if fallback else None


def fetch_and_save_geojson_by_name(city_name: str, out_dir: str, prefer_full: bool = False) -> Path:
    """
    Resolve *city_name* to an adcode, then download and save its data.

    Args:
        city_name: Name to resolve; also used as the output file base name.
        out_dir: Output directory.
        prefer_full: Passed through to ``resolve_adcode_by_name``.

    Returns:
        Path of the written ``.geojson`` file.

    Raises:
        ValueError: If the name cannot be resolved to a code.
    """
    code = resolve_adcode_by_name(city_name, prefer_full=prefer_full)
    if not code:
        raise ValueError(f"无法通过名称解析到行政区划代码: {city_name}")
    return fetch_and_save_geojson(code, city_name, out_dir)


if __name__ == "__main__":
    # Other names that can be tried here: "武汉市", "十堰市".
    city_name = "湖北省"
    out_dir = "./data/vectors/"
    out = fetch_and_save_geojson_by_name(city_name, out_dir, prefer_full=False)
    print(f"Saved raw JSON and GeoJSON for {city_name}: {out}")