osm_wikidata_check/osm_wikidata_check.py

94 lines
2.1 KiB
Python
Raw Normal View History

2022-12-18 21:18:01 +01:00
#!/usr/bin/env python3
import sys
2022-12-18 22:14:44 +01:00
import json
2022-12-18 21:18:01 +01:00
import requests
from ipo import ipo, opi, p, read, flatten
2022-12-18 22:14:44 +01:00
def ids(elements):
    """Return the sorted, de-duplicated Wikidata IDs found in *elements*.

    Each element is a mapping that may carry a "tags" mapping. Only tags
    whose key is exactly "wikidata" or ends in ":wikidata" are considered.
    Their values are split on ";", surrounding whitespace is stripped and
    empty pieces are dropped.
    """
    collected = set()
    for element in elements:
        # Elements without tags contribute nothing.
        if "tags" not in element:
            continue
        for key, value in element["tags"].items():
            if key != "wikidata" and not key.endswith(":wikidata"):
                continue
            # One tag value may list several IDs separated by ";".
            for piece in value.split(sep=";"):
                candidate = piece.strip()
                # Keep only non-empty entries.
                if candidate:
                    collected.add(candidate)
    return sorted(collected)
def first(iterable, default=None):
    """Return the first item produced by *iterable*, or *default* if it is empty."""
    for item in iterable:
        return item
    return default
def overpass_query(wd_ids):
    """Return an Overpass QL query selecting the elements tagged with any of *wd_ids*.

    The per-ID selectors come from overpass_selectors() and are wrapped in a
    union that is output with body and skeleton.
    """
    selectors = overpass_selectors(wd_ids)
    header = " [out:json][timeout:25]; (\n"
    footer = "\n); out body; >; out skel qt;"
    return header + selectors + footer
def overpass_selector(wd_id):
    """Return the two Overpass QL statements selecting elements tagged with *wd_id*.

    The first statement covers the plain "wikidata" key, the second every key
    ending in ":wikidata". Both match *wd_id* as one complete entry of a
    ";"-separated value, optionally surrounded by spaces. That way "Q42" no
    longer matches "Q421", and multi-valued tags such as "Q1;Q42" are found.

    The "{{bbox}}" text is emitted literally (presumably a placeholder the
    query front-end substitutes; confirm against the tool the query is
    pasted into).
    """
    # NOTE(review): wd_id is inserted unescaped. IDs taken from OSM tags that
    # contain '"' or regex metacharacters would yield a malformed or overly
    # broad pattern.
    value_pattern = "(^|;) *" + wd_id + " *(;|$)"
    return (
        ' nwr["wikidata"~"' + value_pattern + '"]({{bbox}});\n'
        ' nwr[~":wikidata$"~"' + value_pattern + '"]({{bbox}});'
    )
def overpass_selectors(wd_ids):
    """Return the selectors for every ID in *wd_ids*, one per line group."""
    per_id = map(overpass_selector, wd_ids)
    return "\n".join(per_id)
2022-12-18 21:18:01 +01:00
# Seconds to wait for Wikidata before giving up on a request; without a
# timeout a stalled connection would hang the script indefinitely.
REQUEST_TIMEOUT = 30

# One session for all lookups so the connection can be reused.
s = requests.Session()

# The OSM data (JSON) is read from standard input.
data = json.load(sys.stdin)
# Explicit checks rather than `assert`, which is stripped under `python -O`.
if not isinstance(data, dict) or data.get("version") != 0.6:
    sys.exit("Expecting OpenStreetMap data on standard input")
if "elements" not in data:
    sys.exit('OpenStreetMap data on standard input has no "elements" member')

# IDs for which the request failed, and IDs answered with a different entity.
notfound = []
redirects = []

headers = {
    "Accept-Encoding": "gzip,deflate",
    "Accept": "application/json",
    "User-Agent": "osm_wikidata_check"
}

try:
    print(" Wikidata ID | Redirect/issue | Label")
    for wd_id in ids(data["elements"]):
        r = s.get(
            f"https://www.wikidata.org/entity/{wd_id}",
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        if not r.ok:
            # Any unsuccessful HTTP status is reported as "not found".
            print(f"{wd_id:>15} | NOT FOUND! |\t")
            notfound.append(wd_id)
            continue

        redirect = ""
        entities = r.json()["entities"]
        try:
            entity = entities[wd_id]
        except KeyError:
            # The response describes an entity under another ID: record the
            # requested ID as a redirect and show the ID that was returned.
            new_id, entity = first(entities.items())
            redirect = f"{new_id}"
            redirects.append(wd_id)

        # Entities without "labels" fall back to "lemmas"; either may be missing.
        labels = entity.get("labels") or entity.get("lemmas") or {}
        # Prefer Dutch, then English, then any available language. Fall back
        # to an empty label instead of aborting the whole run.
        chosen = labels.get("nl") or labels.get("en") or first(labels.values(), {})
        label = chosen.get("value", "")
        if len(label) > 46:
            label = f"{label:>.45}"

        print(f"{wd_id:>15} | {redirect:>15} | {label}")

finally:
    # Always print the summary, even when the loop was interrupted.
    print()
    if notfound:
        print(f"Not found: {notfound}")
        print(overpass_query(notfound))
        print()
    if redirects:
        print(f"Redirects: {redirects}")
        print(overpass_query(redirects))
        print()