Scripts: Update imageAnalysis script to also backup the images

This commit is contained in:
Pieter Vander Vennet 2023-05-18 13:07:14 +02:00
parent e5cc7eec71
commit 088fbe1d07
3 changed files with 156 additions and 48 deletions

View file

@ -1,3 +1,5 @@
(To rerun the analysis: use 'scripts/generateImageAnalysis'. Delete 'features_with_*.geojson' first to force updating the OSM-dataset)
# What licenses are used? # What licenses are used?
Now that MapComplete is three-and-a-half years old, it's a good time to see which licenses people are using to upload their images. Now that MapComplete is three-and-a-half years old, it's a good time to see which licenses people are using to upload their images.

View file

@ -1,9 +1,9 @@
import * as fs from "fs" import * as fs from "fs"
import { existsSync, lstatSync, readdirSync, readFileSync } from "fs" import {existsSync, lstatSync, readdirSync, readFileSync} from "fs"
import { Utils } from "../Utils" import {Utils} from "../Utils"
import * as https from "https" import * as https from "https"
import { LayoutConfigJson } from "../Models/ThemeConfig/Json/LayoutConfigJson" import {LayoutConfigJson} from "../Models/ThemeConfig/Json/LayoutConfigJson"
import { LayerConfigJson } from "../Models/ThemeConfig/Json/LayerConfigJson" import {LayerConfigJson} from "../Models/ThemeConfig/Json/LayerConfigJson"
import xml2js from "xml2js" import xml2js from "xml2js"
export default class ScriptUtils { export default class ScriptUtils {
@ -37,14 +37,16 @@ export default class ScriptUtils {
return result return result
} }
public static DownloadFileTo(url, targetFilePath: string): void { public static DownloadFileTo(url, targetFilePath: string): Promise<void> {
console.log("Downloading ", url, "to", targetFilePath) ScriptUtils.erasableLog("Downloading", url, "to", targetFilePath)
https.get(url, (res) => { return new Promise<void>((resolve, err) => {
const filePath = fs.createWriteStream(targetFilePath) https.get(url, (res) => {
res.pipe(filePath) const filePath = fs.createWriteStream(targetFilePath)
filePath.on("finish", () => { res.pipe(filePath)
filePath.close() filePath.on("finish", () => {
console.log("Download Completed") filePath.close()
resolve()
})
}) })
}) })
} }
@ -78,13 +80,13 @@ export default class ScriptUtils {
.filter((path) => path.indexOf("license_info.json") < 0) .filter((path) => path.indexOf("license_info.json") < 0)
.map((path) => { .map((path) => {
try { try {
const contents = readFileSync(path, { encoding: "utf8" }) const contents = readFileSync(path, {encoding: "utf8"})
if (contents === "") { if (contents === "") {
throw "The file " + path + " is empty, did you properly save?" throw "The file " + path + " is empty, did you properly save?"
} }
const parsed = JSON.parse(contents) const parsed = JSON.parse(contents)
return { parsed, path } return {parsed, path}
} catch (e) { } catch (e) {
console.error("Could not parse file ", "./assets/layers/" + path, "due to ", e) console.error("Could not parse file ", "./assets/layers/" + path, "due to ", e)
throw e throw e
@ -101,12 +103,12 @@ export default class ScriptUtils {
public static getThemeFiles(): { parsed: LayoutConfigJson; path: string }[] { public static getThemeFiles(): { parsed: LayoutConfigJson; path: string }[] {
return this.getThemePaths().map((path) => { return this.getThemePaths().map((path) => {
try { try {
const contents = readFileSync(path, { encoding: "utf8" }) const contents = readFileSync(path, {encoding: "utf8"})
if (contents === "") { if (contents === "") {
throw "The file " + path + " is empty, did you properly save?" throw "The file " + path + " is empty, did you properly save?"
} }
const parsed = JSON.parse(contents) const parsed = JSON.parse(contents)
return { parsed: parsed, path: path } return {parsed: parsed, path: path}
} catch (e) { } catch (e) {
console.error("Could not read file ", path, "due to ", e) console.error("Could not read file ", path, "due to ", e)
throw e throw e
@ -125,14 +127,14 @@ export default class ScriptUtils {
if (!existsSync(path)) { if (!existsSync(path)) {
throw "File not found: " + path throw "File not found: " + path
} }
const root = await xml2js.parseStringPromise(readFileSync(path, { encoding: "utf8" })) const root = await xml2js.parseStringPromise(readFileSync(path, {encoding: "utf8"}))
return root.svg return root.svg
} }
public static ReadSvgSync(path: string, callback: (svg: any) => void): any { public static ReadSvgSync(path: string, callback: (svg: any) => void): any {
xml2js.parseString( xml2js.parseString(
readFileSync(path, { encoding: "utf8" }), readFileSync(path, {encoding: "utf8"}),
{ async: false }, {async: false},
(err, root) => { (err, root) => {
if (err) { if (err) {
throw err throw err
@ -171,7 +173,7 @@ export default class ScriptUtils {
}) })
res.addListener("end", function () { res.addListener("end", function () {
resolve({ content: parts.join("") }) resolve({content: parts.join("")})
}) })
} }
) )

View file

@ -1,14 +1,15 @@
import Script from "./Script" import Script from "./Script"
import { Overpass } from "../Logic/Osm/Overpass" import {Overpass} from "../Logic/Osm/Overpass"
import { RegexTag } from "../Logic/Tags/RegexTag" import {RegexTag} from "../Logic/Tags/RegexTag"
import { ImmutableStore } from "../Logic/UIEventSource" import {ImmutableStore} from "../Logic/UIEventSource"
import { BBox } from "../Logic/BBox" import {BBox} from "../Logic/BBox"
import * as fs from "fs" import * as fs from "fs"
import { Feature } from "geojson" import {Feature} from "geojson"
import ScriptUtils from "./ScriptUtils" import ScriptUtils from "./ScriptUtils"
import { Imgur } from "../Logic/ImageProviders/Imgur" import {Imgur} from "../Logic/ImageProviders/Imgur"
import { LicenseInfo } from "../Logic/ImageProviders/LicenseInfo" import {LicenseInfo} from "../Logic/ImageProviders/LicenseInfo"
import { Utils } from "../Utils" import {Utils} from "../Utils"
import Constants from "../Models/Constants";
export default class GenerateImageAnalysis extends Script { export default class GenerateImageAnalysis extends Script {
constructor() { constructor() {
@ -17,9 +18,9 @@ export default class GenerateImageAnalysis extends Script {
) )
} }
async fetchImages(key: string, datapath: string): Promise<void> { async fetchImages(key: string, datapath: string, refresh: boolean): Promise<void> {
const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson` const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson`
if (fs.existsSync(targetPath)) { if (fs.existsSync(targetPath) && !refresh) {
console.log("Skipping", key) console.log("Skipping", key)
return return
} }
@ -27,27 +28,26 @@ export default class GenerateImageAnalysis extends Script {
const overpass = new Overpass( const overpass = new Overpass(
tag, tag,
[], [],
"https://overpass.kumi.systems/api/interpreter", Constants.defaultOverpassUrls[0], //"https://overpass.kumi.systems/api/interpreter",
new ImmutableStore(500), new ImmutableStore(500),
undefined, undefined,
false false
) )
console.log("Starting query...") console.log("Starting query...")
const data = await overpass.queryGeoJson(BBox.global) const data = await overpass.queryGeoJson(BBox.global)
console.log("Got data: ", data[0].features.length) console.log("Got data:", data[0].features.length, "items; timestamp:", data[1].toISOString())
fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8") fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8")
console.log("Written", targetPath) console.log("Written", targetPath)
} }
async downloadData(datapath: string): Promise<void> { async downloadData(datapath: string, refresh: boolean): Promise<void> {
if (!fs.existsSync(datapath)) { if (!fs.existsSync(datapath)) {
fs.mkdirSync(datapath) fs.mkdirSync(datapath)
} }
await this.fetchImages("image", datapath, refresh)
await this.fetchImages("image", datapath) await this.fetchImages("image:streetsign", datapath, refresh)
await this.fetchImages("image:streetsign", datapath)
for (let i = 0; i < 5; i++) { for (let i = 0; i < 5; i++) {
await this.fetchImages("image:" + i, datapath) await this.fetchImages("image:" + i, datapath, refresh)
} }
} }
@ -73,25 +73,55 @@ export default class GenerateImageAnalysis extends Script {
if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) { if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) {
return false return false
} }
const targetPath = datapath + "/" + image.replace(/[\/:.\-%]/g, "_") + ".json" const filename = image.replace(/[\/:.\-%]/g, "_") + ".json"
const targetPath = datapath + "/" + filename
if (fs.existsSync(targetPath)) { if (fs.existsSync(targetPath)) {
return false return false
} }
const attribution = await Imgur.singleton.DownloadAttribution(image) const attribution = await Imgur.singleton.DownloadAttribution(image)
if ((attribution.artist ?? "") === "") {
// This is an invalid attribution. We save the raw response as well
const hash = image.substr("https://i.imgur.com/".length).split(".jpg")[0]
const apiUrl = "https://api.imgur.com/3/image/" + hash
const response = await Utils.downloadJsonCached(apiUrl, 365 * 24 * 60 * 60, {
Authorization: "Client-ID " + Constants.ImgurApiKey,
})
const rawTarget = datapath + "/raw/" + filename
console.log("Also storing the raw response to", rawTarget)
await fs.writeFileSync(rawTarget, JSON.stringify(response, null, " "))
}
await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, " ")) await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, " "))
return true return true
} }
async downloadMetadata(datapath: string): Promise<void> { loadImageUrls(datapath: string): { allImages: Set<string>, imageSource: Map<string, string> } {
const features = this.loadData(datapath)
let allImages = new Set<string>() let allImages = new Set<string>()
const features = this.loadData(datapath)
let imageSource: Map<string, string> = new Map<string, string>()
for (const feature of features) { for (const feature of features) {
allImages.add(feature.properties["image"]) allImages.add(feature.properties["image"])
imageSource[feature.properties["image"]] = feature.properties.id
allImages.add(feature.properties["image:streetsign"])
imageSource[feature.properties["image:streetsign"]] = feature.properties.id + " (streetsign)"
for (let i = 0; i < 10; i++) { for (let i = 0; i < 10; i++) {
allImages.add(feature.properties["image:" + i]) allImages.add(feature.properties["image:" + i])
imageSource[feature.properties["image:" + i]] = `${feature.properties.id} (image:${i})`
} }
} }
allImages.delete(undefined)
allImages.delete(null)
imageSource.delete(undefined)
imageSource.delete(null)
return {allImages, imageSource}
}
async downloadMetadata(datapath: string): Promise<void> {
const {allImages, imageSource} = this.loadImageUrls(datapath)
console.log("Detected", allImages.size, "images") console.log("Detected", allImages.size, "images")
let i = 0 let i = 0
let d = 0 let d = 0
@ -113,10 +143,9 @@ export default class GenerateImageAnalysis extends Script {
} downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor( } downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor(
runningSecs runningSecs
)}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}` )}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}`
ScriptUtils.erasableLog( if (d + f % 1000 === 1 || downloaded) {
" ", ScriptUtils.erasableLog(msg)
msg }
)
if (downloaded) { if (downloaded) {
d++ d++
} else { } else {
@ -124,10 +153,80 @@ export default class GenerateImageAnalysis extends Script {
} }
if (d + f == 75000) { if (d + f == 75000) {
console.log("Used 75000 API calls, leaving 5000 for the rest of the day...") console.log("Used 75000 API calls, leaving 5000 for the rest of the day...")
break
}
} catch (e) {
// console.log(e)
console.log("Offending image hash is", image, "from https://openstreetmap.org/" + imageSource[image])
f++
}
}
}
/**
 * Backs up a single imgur image into `imagePath`, unless it is already present.
 *
 * The image is stored under the last part of its URL (e.g. `abcdef.jpg`).
 * Files saved under the legacy "long" name (the full URL with `/ : . - %`
 * replaced by `_`, plus `.jpg`) are migrated: renamed to the short name, or
 * removed when the short-named file exists as well.
 *
 * @param url the image URL; must start with `https://i.imgur.com/`
 * @param imagePath directory in which the backup is stored
 * @returns `true` if the image was freshly downloaded, `false` if it was
 *          already on disk (possibly after migrating the legacy file name)
 * @throws Error if `url` is not an `https://i.imgur.com/` URL or if the file
 *         name derived from it is empty or would escape `imagePath`
 */
async downloadImage(url: string, imagePath: string): Promise<boolean> {
    const imgurPrefix = "https://i.imgur.com/"
    // The URLs come from OSM tags and are thus untrusted: without this check,
    // cutting off the prefix would yield an arbitrary (possibly nested) path.
    if (!url.startsWith(imgurPrefix)) {
        throw new Error("Not an i.imgur.com image url: " + url)
    }
    const filename = url.substring(imgurPrefix.length)
    if (
        filename === "" ||
        filename === "." ||
        filename === ".." ||
        filename.includes("/") ||
        filename.includes("\\")
    ) {
        throw new Error("Cannot derive a safe file name from image url: " + url)
    }

    const targetPath = imagePath + "/" + filename
    // Name used by earlier runs of this script
    const targetPathLong = imagePath + "/" + url.replace(/[\/:.\-%]/g, "_") + ".jpg"

    if (fs.existsSync(targetPathLong)) {
        if (fs.existsSync(targetPath)) {
            // Both variants exist: keep the short one
            fs.unlinkSync(targetPathLong)
            console.log("Unlinking duplicate")
            return false
        }
        console.log("Renaming...")
        fs.renameSync(targetPathLong, targetPath)
        return false
    }

    if (fs.existsSync(targetPath)) {
        return false
    }

    // NOTE(review): ScriptUtils.DownloadFileTo has no error handler and does not
    // check the HTTP status, so a failed request may leave a broken file behind.
    await ScriptUtils.DownloadFileTo(url, targetPath)
    return true
}
/**
 * Downloads a backup copy of every imgur image referenced by the loaded
 * features into `imagePath`.
 *
 * Tag values may hold several `;`-separated URLs. Every part is handled as a
 * URL of its own; parts which are not hosted on `https://i.imgur.com/` are
 * counted as `invalid` and not downloaded. Images are fetched one at a time.
 * A failing download is logged and counted, it does not abort the run.
 *
 * @param datapath directory containing the `features_with_*.geojson` files
 * @param imagePath directory in which the images are stored
 */
async downloadAllImages(datapath: string, imagePath: string): Promise<void> {
    const {allImages} = this.loadImageUrls(datapath)
    const imgurPrefix = "https://i.imgur.com/"

    // Split multi-valued tags first, so that the prefix check, the total and
    // the ETA are all based on individual URLs.
    const urlSet = new Set<string>()
    let invalid = 0
    for (const value of Array.from(allImages)) {
        for (const part of value.split(";")) {
            const candidate = part.trim()
            if (candidate === "") {
                continue
            }
            if (candidate.startsWith(imgurPrefix)) {
                urlSet.add(candidate)
            } else {
                invalid++
            }
        }
    }
    const urls = Array.from(urlSet)
    const total = urls.length

    let skipped = 0
    let failed = 0
    let downloaded = 0
    const startTime = Date.now()

    for (const url of urls) {
        try {
            const wasDownloaded = await this.downloadImage(url, imagePath)
            if (wasDownloaded) {
                downloaded += 1
            } else {
                skipped += 1
            }

            if (wasDownloaded || skipped % 10000 === 0) {
                const handled = skipped + downloaded + failed
                const runningTime = (Date.now() - startTime) / 1000
                // items per second; 'handled' is at least 1 here
                const speed = handled / runningTime
                // remaining time = remaining items divided by the throughput
                const timeLeft = Math.round((total - handled) / speed)
                console.log("Handled", url, JSON.stringify({
                    skipped,
                    failed,
                    downloaded,
                    invalid,
                    total,
                    eta: timeLeft + "s"
                }))
            }
        } catch (e) {
            console.log("Could not download", url, e)
            failed++
        }
    }
}
@ -141,7 +240,7 @@ export default class GenerateImageAnalysis extends Script {
if (!file.endsWith(".json")) { if (!file.endsWith(".json")) {
continue continue
} }
const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, { encoding: "utf8" })) const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, {encoding: "utf8"}))
const license = attr.licenseShortName const license = attr.licenseShortName
if (license === undefined || attr.artist === undefined) { if (license === undefined || attr.artist === undefined) {
@ -220,7 +319,7 @@ export default class GenerateImageAnalysis extends Script {
...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()), ...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()),
] ]
console.log("Total number of correctly licenses pictures: ", totalLicensedImages) console.log("Total number of correctly licenses pictures: ", totalLicensedImages, "(out of ", files.length, " images)")
console.log("Total number of authors:", byAuthor.size) console.log("Total number of authors:", byAuthor.size)
console.log( console.log(
"Total number of authors which used a valid, non CC0 license at one point in time", "Total number of authors which used a valid, non CC0 license at one point in time",
@ -230,10 +329,15 @@ export default class GenerateImageAnalysis extends Script {
} }
async main(args: string[]): Promise<void> { async main(args: string[]): Promise<void> {
console.log("Usage: [--cached] to use the cached osm data")
console.log("Args are", args)
const cached = args.indexOf("--cached") < 0
args = args.filter(a => a !== "--cached")
const datapath = args[0] ?? "../../git/MapComplete-data/ImageLicenseInfo" const datapath = args[0] ?? "../../git/MapComplete-data/ImageLicenseInfo"
await this.downloadData(datapath) await this.downloadData(datapath, cached)
await this.downloadMetadata(datapath) await this.downloadMetadata(datapath)
await this.downloadAllImages(datapath, "/home/pietervdvn/data/imgur-image-backup")
this.analyze(datapath) this.analyze(datapath)
} }
} }