Some more experimentation with the conflation script
This commit is contained in:
parent
8eda65a24f
commit
99cb879cfe
3 changed files with 321 additions and 102 deletions
|
@ -46,7 +46,8 @@
|
|||
"weblate-merge": "git remote update weblate-github; git merge weblate-github/weblate-mapcomplete-core weblate-github/weblate-mapcomplete-layers weblate-github/weblate-mapcomplete-layer-translations",
|
||||
"weblate-fix-heavy": "git fetch weblate-hosted-layers; git fetch weblate-hosted-core; git merge weblate-hosted-layers/master weblate-hosted-core/master ",
|
||||
"housekeeping": "git pull && npm run weblate-fix-heavy && npm run generate && npm run generate:docs && npm run generate:contributor-list && vite-node scripts/fetchLanguages.ts && npm run format && git add assets/ langs/ Docs/ **/*.ts Docs/* && git commit -m 'chore: automated housekeeping...'",
|
||||
"parseSchools": "vite-node scripts/schools/amendSchoolData.ts"
|
||||
"parseSchools": "vite-node scripts/schools/amendSchoolData.ts",
|
||||
"conflate": "vite-node scripts/conflate.ts -- ../onwheels-data-prep/osm_pharmacies.geojson ../onwheels-data-prep/OnWheelsData_apotheek.geojson"
|
||||
},
|
||||
"keywords": [
|
||||
"OpenStreetMap",
|
||||
|
|
|
@ -146,17 +146,20 @@ export default class ScriptUtils {
|
|||
|
||||
/**
 * Downloads `url` and parses the response body as JSON.
 *
 * @param url the address to fetch
 * @param headers optional request headers, handed to `ScriptUtils.Download`
 * @returns the parsed JSON document
 * @throws Error if the server answered with a redirect, as there is no body to parse then
 */
private static async DownloadJSON(url: string, headers?: any): Promise<any> {
    const data = await ScriptUtils.Download(url, headers)
    if ("redirect" in data) {
        // `Download` reports a redirect instead of following it; parsing the (absent) content
        // would otherwise fail with a confusing syntax error
        throw new Error("Cannot parse JSON from " + url + ": got a redirect to " + data.redirect)
    }
    return JSON.parse(data.content)
}
|
||||
|
||||
private static Download(url: string, headers?: any): Promise<{ content: string }> {
|
||||
public static Download(
|
||||
url: string,
|
||||
headers?: any
|
||||
): Promise<{ content: string } | { redirect: string }> {
|
||||
return new Promise((resolve, reject) => {
|
||||
try {
|
||||
headers = headers ?? {}
|
||||
headers.accept = "application/json"
|
||||
console.log(" > ScriptUtils.DownloadJson(", url, ")")
|
||||
console.log(" > ScriptUtils.Download(", url, ")")
|
||||
const urlObj = new URL(url)
|
||||
https.get(
|
||||
const request = https.get(
|
||||
{
|
||||
host: urlObj.host,
|
||||
path: urlObj.pathname + urlObj.search,
|
||||
|
@ -173,10 +176,26 @@ export default class ScriptUtils {
|
|||
})
|
||||
|
||||
res.addListener("end", function () {
|
||||
if (res.statusCode === 301 || res.statusCode === 302) {
|
||||
console.log("Got a redirect:", res.headers.location)
|
||||
resolve({ redirect: res.headers.location })
|
||||
}
|
||||
if (res.statusCode >= 400) {
|
||||
console.log(
|
||||
"Error while fetching ",
|
||||
url,
|
||||
"due to",
|
||||
res.statusMessage
|
||||
)
|
||||
reject(res.statusCode)
|
||||
}
|
||||
resolve({ content: parts.join("") })
|
||||
})
|
||||
}
|
||||
)
|
||||
request.on("error", function (e) {
|
||||
reject(e)
|
||||
})
|
||||
} catch (e) {
|
||||
reject(e)
|
||||
}
|
||||
|
|
|
@ -4,8 +4,31 @@ import { Feature } from "geojson"
|
|||
import { GeoOperations } from "../Logic/GeoOperations"
|
||||
import { Utils } from "../Utils"
|
||||
import { OsmObject } from "../Logic/Osm/OsmObject"
|
||||
import { PhoneTextField, UrlTextfieldDef } from "../UI/Input/ValidatedTextField"
|
||||
import { OsmId } from "../Models/OsmFeature"
|
||||
import ScriptUtils from "./ScriptUtils"
|
||||
|
||||
/**
 * A candidate pairing of one feature of the external dataset with one feature of the OSM dataset.
 */
interface PossibleMatch {
    /** The candidate feature taken from the OSM dataset */
    osm_feature: Feature

    /** The feature taken from the external dataset */
    external_feature: Feature

    /**
     * Distance in meter between the OSM-data and the external dataset
     */
    d: number
}
|
||||
|
||||
/**
 * Outcome of comparing an external feature with the edit history of an OSM object.
 */
interface ReplayResult {
    /** The properties of the external feature which were not found in any version of the OSM object */
    resting_properties?: Record<string, string>

    /** Set if some version of the OSM object carried the same name as the external feature */
    possibly_imported?: boolean

    /** Set if the OSM object and the external feature are (nearly) at the same location */
    certainly_imported?: boolean
}
|
||||
|
||||
export class Conflate extends Script {
|
||||
private earliestDate: Date = undefined
|
||||
private latestDate: Date = undefined
|
||||
private readonly historyCacheDir = "/tmp/cache/"
|
||||
|
||||
constructor() {
|
||||
super(
|
||||
[
|
||||
|
@ -22,10 +45,88 @@ export class Conflate extends Script {
|
|||
)
|
||||
}
|
||||
|
||||
/**
 * Entry point of the script: matches every feature of an external dataset against the nearby
 * features of an OSM extract and writes the outcome to disk.
 *
 * Two files are written:
 * - `../onwheels-data-prep/matches.tsv`: one row per matched feature which still has external
 *   properties that were not found in the history of the OSM object
 * - `../onwheels-data-prep/unmatched.geojson`: the external features for which no match was found
 *
 * @param args `[osm_file_path, external_file_path, max_range?]`; `max_range` defaults to 50
 * @throws Error if a path is missing, if the file names suggest that the arguments were swapped
 *         or if `max_range` is not a number
 */
async main(args: string[]): Promise<void> {
    const [osm_file_path, external_file_path] = args
    if (osm_file_path === undefined || external_file_path === undefined) {
        throw new Error(
            "Expected at least two arguments: the path of the OSM file and the path of the external file"
        )
    }

    let max_range = 50
    if (args[2] !== undefined) {
        max_range = Number(args[2])
        if (Number.isNaN(max_range)) {
            // Comparing a distance against NaN is always false, so nothing would ever match
            throw new Error("The maximum range must be a number, but got '" + args[2] + "'")
        }
    }

    // Safeguard against swapped arguments, based on the file names only
    const osmPathLower = osm_file_path.toLowerCase()
    if (!osmPathLower.includes("osm") && !osmPathLower.includes("openstreetmap")) {
        throw new Error("OSM File path must contain 'osm' or 'openStreetMap'")
    }
    const externalPathLower = external_file_path.toLowerCase()
    if (externalPathLower.includes("osm") || externalPathLower.includes("openstreetmap")) {
        throw new Error("External File path may not contain 'osm' or 'openStreetMap'")
    }

    const external_features: Feature[] = JSON.parse(
        fs.readFileSync(external_file_path, { encoding: "utf-8" })
    ).features
    const osm_features: Feature[] = JSON.parse(
        fs.readFileSync(osm_file_path, { encoding: "utf-8" })
    ).features

    const bestMatches = await this.calculateMatches(external_features, osm_features, max_range)

    // A set gives a constant-time lookup per external feature (compared by identity)
    const matchedExternalFeatures = new Set(bestMatches.map(({ match }) => match.external_feature))
    const unmatched = external_features.filter((f) => !matchedExternalFeatures.has(f))

    const rows: (string | number)[][] = [
        [
            "osm_id",
            "match_distance",
            "osm_name",
            "imported",
            "status_external",
            "...properties_differences",
        ],
    ]
    for (const { match, replayed } of bestMatches) {
        const { d, osm_feature } = match
        const { possibly_imported, certainly_imported, resting_properties } = replayed
        // 'status' gets its own column and does not count as a difference
        const { status, ...differences } = resting_properties ?? {}
        if (Object.keys(differences).length === 0) {
            // Everything of the external feature was seen in OSM already: nothing to report
            continue
        }
        rows.push([
            osm_feature.properties["@id"],
            d,
            osm_feature.properties.name,
            certainly_imported ? "import" : possibly_imported ? "prob import" : "new",
            status,
            JSON.stringify(differences),
        ])
    }

    fs.writeFileSync(
        "../onwheels-data-prep/matches.tsv",
        rows.map((row) => row.join("\t")).join("\n")
    )

    fs.writeFileSync(
        "../onwheels-data-prep/unmatched.geojson",
        JSON.stringify(
            {
                type: "FeatureCollection",
                features: unmatched,
            },
            null,
            "  "
        )
    )
}
|
||||
|
||||
private async findTimeFork(
|
||||
externalName: string,
|
||||
osmName: string,
|
||||
osmId: string
|
||||
osmId: OsmId
|
||||
): Promise<{ earliestDateOfImport; latestDateOfImport }> {
|
||||
const history = await OsmObject.DownloadHistory(osmId).AsPromise((h) => h.length > 0)
|
||||
let earliest: Date = undefined
|
||||
|
@ -60,106 +161,204 @@ export class Conflate extends Script {
|
|||
return { earliestDateOfImport: earliest, latestDateOfImport: latest }
|
||||
}
|
||||
|
||||
private earliestDate: Date = undefined
|
||||
private latestDate: Date = undefined
|
||||
private findPossibleMatchesFor(
|
||||
osm_features: Feature[],
|
||||
externalFeature: Feature,
|
||||
max_range: number
|
||||
): PossibleMatch[] {
|
||||
const possibleMatches: PossibleMatch[] = []
|
||||
for (const osmFeature of osm_features) {
|
||||
const d = GeoOperations.distanceBetween(
|
||||
GeoOperations.centerpointCoordinates(externalFeature),
|
||||
GeoOperations.centerpointCoordinates(osmFeature)
|
||||
)
|
||||
|
||||
async main(args: string[]): Promise<void> {
|
||||
const [osm_file_path, external_file_path] = args
|
||||
let max_range = 50
|
||||
if (args.length === 3) {
|
||||
max_range = Number(args[2])
|
||||
}
|
||||
if (
|
||||
osm_file_path.toLowerCase().indexOf("osm") < 0 &&
|
||||
osm_file_path.toLowerCase().indexOf("openstreetmap") < 0
|
||||
) {
|
||||
throw "OSM File path must contain 'osm' or 'openStreetMap'"
|
||||
}
|
||||
|
||||
if (
|
||||
external_file_path.toLowerCase().indexOf("osm") >= 0 ||
|
||||
external_file_path.toLowerCase().indexOf("openstreetmap") >= 0
|
||||
) {
|
||||
throw "External File path may not contain 'osm' or 'openStreetMap'"
|
||||
}
|
||||
|
||||
const external_features: Feature[] = JSON.parse(
|
||||
fs.readFileSync(external_file_path, { encoding: "utf-8" })
|
||||
).features
|
||||
const osm_features: Feature[] = JSON.parse(
|
||||
fs.readFileSync(osm_file_path, { encoding: "utf-8" })
|
||||
).features
|
||||
|
||||
const match_lengths: (string | number)[][] = [
|
||||
[
|
||||
"osm_id",
|
||||
"external_index",
|
||||
"match_distance",
|
||||
"name_levenshtein_distance",
|
||||
"osm_data",
|
||||
"external_data",
|
||||
"status",
|
||||
],
|
||||
]
|
||||
for (let i = 0; i < external_features.length; i++) {
|
||||
// console.log("Inspecting " + (i + 1) + "/" + external_features.length)
|
||||
const externalFeature = external_features[i]
|
||||
const possibleMatches: number[] = []
|
||||
for (const osmFeature of osm_features) {
|
||||
const d = GeoOperations.distanceBetween(
|
||||
GeoOperations.centerpointCoordinates(externalFeature),
|
||||
GeoOperations.centerpointCoordinates(osmFeature)
|
||||
)
|
||||
|
||||
if (d === 0) {
|
||||
console.log(
|
||||
"Found an exact match (name match: ",
|
||||
osmFeature.properties.name === externalFeature.properties.name,
|
||||
osmFeature.properties.name,
|
||||
externalFeature.properties.name
|
||||
)
|
||||
continue
|
||||
}
|
||||
continue
|
||||
if (d < max_range) {
|
||||
console.log("Found a match")
|
||||
match_lengths.push([
|
||||
osmFeature.properties["@id"],
|
||||
(i + " " + possibleMatches.join(",")).trim(),
|
||||
d,
|
||||
this.levenshteinDistancePharmacy(
|
||||
externalFeature.properties.name,
|
||||
osmFeature.properties.name
|
||||
),
|
||||
externalFeature.properties.status,
|
||||
...this.conflate(osmFeature.properties, externalFeature.properties),
|
||||
])
|
||||
possibleMatches.push(osmFeature.properties["@id"])
|
||||
/*
|
||||
possibleMatches.push({
|
||||
osmFeature,
|
||||
d,
|
||||
nameDist: Utils.levenshteinDistance(
|
||||
osmFeature.properties.name,
|
||||
externalFeature.properties.name
|
||||
),
|
||||
})//*/
|
||||
}
|
||||
// possibleMatches.sort((a, b) => b.d - a.d)
|
||||
if (d < max_range) {
|
||||
possibleMatches.push({
|
||||
external_feature: externalFeature,
|
||||
osm_feature: osmFeature,
|
||||
d,
|
||||
})
|
||||
}
|
||||
}
|
||||
match_lengths.sort((a, b) => <number>b[1] - <number>a[1])
|
||||
console.log(
|
||||
"The import probably happened between ",
|
||||
this.earliestDate?.toISOString(),
|
||||
"and",
|
||||
this.latestDate?.toISOString()
|
||||
return possibleMatches
|
||||
}
|
||||
|
||||
/**
 * Checks whether a website can still be reached and caches the verdict on disk, so that every
 * address is probed at most once.
 *
 * @param url the address of the website
 * @returns `true` if the site answered, `false` if it could not be reached, or a string with the
 *          address the site redirects to
 */
private async stillOnline(url: string): Promise<boolean | string> {
    // Facebook pages are never probed and always count as online
    if (url.includes("facebook.com")) {
        return true
    }

    const cacheDir = this.historyCacheDir + "/urls/"
    // The space in front of the file name is kept, so that existing cache entries stay valid
    const cachePath = cacheDir + " " + url.replace(/[/\\:]/g, "_")
    if (fs.existsSync(cachePath)) {
        return JSON.parse(fs.readFileSync(cachePath, { encoding: "utf-8" }))
    }

    let online: boolean | string = false
    try {
        // No verdict means that the server answered without usable content: keep the site.
        // Without this fallback, `JSON.stringify(undefined)` would make the write below fail
        online = (await this.stillOnlineUncached(url)) ?? true
    } catch (e) {
        console.log(e)
        let pathname = ""
        try {
            pathname = new URL(url).pathname
        } catch {
            // `url` has no scheme and cannot be parsed; there is no path to inspect then
        }
        if (e === "NOT_FOUND" && pathname.length > 0) {
            console.log("Maybe trying the homepage will help?")
        }
    }

    // The cache directory does not necessarily exist yet
    fs.mkdirSync(cacheDir, { recursive: true })
    fs.writeFileSync(cachePath, JSON.stringify(online, null, "  "), { encoding: "utf-8" })
    return online
}
|
||||
|
||||
/**
 * Probes a website with a single download attempt, without consulting any cache.
 * The address is always requested over https; a missing scheme is added.
 *
 * @param url the address of the website, with or without scheme
 * @returns `false` if the download failed; the redirect target (a string) if the server
 *          redirects to another absolute address; `true` in all other cases
 */
private async stillOnlineUncached(url: string): Promise<boolean | string> {
    let target = url
    if (!target.startsWith("http")) {
        target = "https://" + target
    }
    target = target.replace("http://", "https://")

    try {
        const result = await ScriptUtils.Download(target)
        if ("redirect" in result && result.redirect) {
            // A redirect to a path on the same host still means that the site itself is online
            return result.redirect.startsWith("/") ? true : result.redirect
        }
        if ("content" in result && result.content) {
            return true
        }
        console.error("Got a result, but no content?", target, result)
        // The server did answer, so the site is considered online.
        // Falling through here would return `undefined`, which is not part of the return type
        return true
    } catch (e) {
        // `Download` may reject with something which is not an Error (e.g. a status code)
        console.log("Offline (error):", target, e instanceof Error ? e.message : e)
        return false
    }
}
|
||||
|
||||
/**
 * Gets the version history of an OSM object, from the cache directory if possible and from
 * `OsmObject.DownloadHistory` otherwise. A downloaded history is written to the cache.
 *
 * Note that a history read from the cache consists of plain parsed JSON data.
 *
 * @param id the id of the OSM object; presumably of the form `type/number` (TODO confirm)
 * @returns the history of the object
 */
private async historyCached(id): Promise<OsmObject[]> {
    const cachePath = this.historyCacheDir + "/" + id.replace("/", "_")
    if (fs.existsSync(cachePath)) {
        return JSON.parse(fs.readFileSync(cachePath, { encoding: "utf-8" }))
    }

    const history = await OsmObject.DownloadHistory(id).AsPromise((l) => l.length > 0)
    // The cache directory does not necessarily exist yet; writing would fail without it
    fs.mkdirSync(this.historyCacheDir, { recursive: true })
    fs.writeFileSync(cachePath, JSON.stringify(history, null, "  "), { encoding: "utf-8" })
    return history
}
|
||||
|
||||
/**
 * Brings a set of properties into a canonical form, so that values of different origin can be
 * compared. The given object is modified in place:
 * - `phone` is reformatted
 * - `website` is lowercased, stripped of common scheme typos, upgraded to https and reformatted;
 *   it is removed if the site is offline and replaced if the site redirects elsewhere
 * - `healthcare=pharmacy` is dropped
 *
 * @param properties the properties to normalize; modified in place
 */
private async normalize(properties: Record<string, string>): Promise<void> {
    if (properties["phone"]) {
        properties["phone"] = new PhoneTextField().reformat(properties["phone"], () => "be")
    }

    if (properties["website"]) {
        // Strings are immutable: the outcome of the replacements must be kept explicitly
        const website = properties.website
            .toLowerCase()
            .replace("http://http://", "http://")
            .replace("https//", "https://")
            .replace("http://", "https://")
        const validator = new UrlTextfieldDef()
        if (validator.isValid(website)) {
            properties.website = new UrlTextfieldDef().reformat(website)
            const stillOnline = await this.stillOnline(website)
            if (stillOnline === false) {
                delete properties.website
            } else if (typeof stillOnline === "string") {
                // The site redirects: keep the address it redirects to
                properties.website = stillOnline
            }
        } else {
            console.log("Invalid url:", website)
        }
    }

    if (properties["healthcare"] === "pharmacy") {
        // we don't care about this tag
        delete properties["healthcare"]
    }
}
|
||||
|
||||
/**
 * Walks through the history of the OSM object of a match and determines which properties of the
 * external feature were, at some point, present on the OSM object.
 *
 * @param match the pairing of an external feature and an OSM feature to inspect
 * @returns
 * - `certainly_imported`: the distance between both features is below 0.0001
 * - `possibly_imported`: some version of the OSM object had exactly the name of the external feature
 * - `resting_properties`: the (normalized) external properties which no version of the OSM object had
 */
private async replay(match: PossibleMatch): Promise<ReplayResult> {
    const history = await this.historyCached(match.osm_feature.properties["@id"])

    const certainly_imported = match.d < 0.0001
    let possibly_imported = false

    // Work on a copy: the external feature itself must stay untouched
    const resting_properties = { ...match.external_feature.properties }
    await this.normalize(resting_properties)
    // Remember the name up front: it is removed from `resting_properties` once it is matched,
    // and a missing name on both sides must not count as "the same name"
    const externalName = resting_properties.name

    for (const historyElement of history) {
        await this.normalize(historyElement.tags)

        if (externalName != null && historyElement.tags.name === externalName) {
            possibly_imported = true
        }

        // Iterate over a snapshot of the keys, as matched keys are deleted along the way
        for (const key of Object.keys(resting_properties)) {
            if (this.str_compare(historyElement.tags[key], resting_properties[key])) {
                delete resting_properties[key]
            }
        }
    }

    return {
        certainly_imported,
        possibly_imported,
        resting_properties,
    }
}
|
||||
|
||||
/**
 * Compares two values as text, ignoring differences in case and in accents.
 *
 * Accents are removed by decomposing the text (NFD) and dropping the combining marks; the rest
 * of the text is kept, so that values which only share a prefix are not considered equal.
 *
 * @param a the first value; anything which is not a string is converted to one
 * @param b the second value; anything which is not a string is converted to one
 * @returns `true` if both values are present and equal after simplification;
 *          `false` if they differ or if one of them is `undefined` or `null`
 */
private str_compare(a, b): boolean {
    if (a === undefined || b === undefined || a === null || b === null) {
        return false
    }
    const simplify = (value: unknown): string =>
        String(value)
            .toLowerCase()
            .normalize("NFD")
            .replace(/[\u0300-\u036f]/g, "")

    return simplify(a) === simplify(b)
}
|
||||
|
||||
/**
 * Determines the best match for every external feature, one feature after the other.
 *
 * @param external_features the features of the external dataset
 * @param osm_features the features of the OSM dataset
 * @param max_range the maximum distance, passed on to `calculateMatch`
 * @returns the best match of every external feature for which a match was found
 */
private async calculateMatches(
    external_features: Feature[],
    osm_features: Feature[],
    max_range: number
): Promise<{ match: PossibleMatch; replayed: ReplayResult }[]> {
    const results: { match: PossibleMatch; replayed: ReplayResult }[] = []
    for (let i = 0; i < external_features.length; i++) {
        const best = await this.calculateMatch(osm_features, external_features[i], max_range)
        if (!best) {
            // No candidate for this feature
            continue
        }
        results.push(best)
    }
    return results
}
|
||||
|
||||
/**
 * Picks the best OSM candidate for a single external feature.
 *
 * Every candidate found by `findPossibleMatchesFor` is replayed against its history. A candidate
 * which was certainly imported beats one which was possibly imported, which in turn beats one
 * without any sign of an import. Among equally ranked candidates, the first one found is kept.
 *
 * @param osm_features the features of the OSM dataset
 * @param externalFeature the external feature to find a match for
 * @param max_range the maximum distance, passed on to `findPossibleMatchesFor`
 * @returns the best candidate together with its replay result, or `undefined` if there is no candidate
 */
private async calculateMatch(
    osm_features: Feature[],
    externalFeature: Feature,
    max_range: number
): Promise<{ match: PossibleMatch; replayed: ReplayResult }> {
    const possibleMatches = this.findPossibleMatchesFor(
        osm_features,
        externalFeature,
        max_range
    )

    // 2: certainly imported, 1: possibly imported, 0: no sign of an import
    const rank = (result: ReplayResult): number =>
        result.certainly_imported ? 2 : result.possibly_imported ? 1 : 0

    let bestMatch: PossibleMatch = undefined
    let bestMatchReplayed: ReplayResult = undefined
    for (const possibleMatch of possibleMatches) {
        const replayed = await this.replay(possibleMatch)
        // Only a strictly better candidate replaces the current best one
        if (bestMatch === undefined || rank(replayed) > rank(bestMatchReplayed)) {
            bestMatch = possibleMatch
            bestMatchReplayed = replayed
        }
    }

    if (bestMatch === undefined) {
        return undefined
    }
    return {
        replayed: bestMatchReplayed,
        match: bestMatch,
    }
}
|
||||
|
||||
private levenshteinDistancePharmacy(a?: string, b?: string) {
|
||||
|
|
Loading…
Reference in a new issue