From 13f8bea37a7ef066e7d7370a6258100bf5311f2e Mon Sep 17 00:00:00 2001
From: Pieter Vander Vennet
Date: Mon, 9 Jan 2023 20:30:13 +0100
Subject: [PATCH] Create image license analysis script

---
 scripts/Script.ts                |  36 ++++
 scripts/generateImageAnalysis.ts | 260 +++++++++++++++++++++++++++++++
 2 files changed, 296 insertions(+)
 create mode 100644 scripts/Script.ts
 create mode 100644 scripts/generateImageAnalysis.ts

diff --git a/scripts/Script.ts b/scripts/Script.ts
new file mode 100644
index 000000000..f76785161
--- /dev/null
+++ b/scripts/Script.ts
@@ -0,0 +1,36 @@
+import ScriptUtils from "./ScriptUtils"
+
+/**
+ * Base class for command line scripts.
+ * A script extends this class, implements `main` and ends its file with `new MyScript().run()`.
+ */
+export default abstract class Script {
+    /** Description of the script, as passed to the constructor */
+    private readonly _docs: string
+
+    constructor(docs: string) {
+        this._docs = docs
+    }
+
+    /**
+     * The actual work of the script.
+     * @param args process.argv without its first two entries
+     */
+    abstract main(args: string[]): Promise<void>
+
+    /**
+     * Calls ScriptUtils.fixUtils() and then `main` with the command line arguments.
+     * A rejection of `main` is logged and sets a non-zero exit code,
+     * instead of ending up as an unhandled promise rejection.
+     */
+    public run(): void {
+        ScriptUtils.fixUtils()
+        const args = process.argv.slice(2)
+        this.main(args)
+            .then(() => console.log("All done"))
+            .catch((e) => {
+                console.error("Script failed:", e)
+                process.exitCode = 1
+            })
+    }
+}
diff --git a/scripts/generateImageAnalysis.ts b/scripts/generateImageAnalysis.ts
new file mode 100644
index 000000000..6b5ca364b
--- /dev/null
+++ b/scripts/generateImageAnalysis.ts
@@ -0,0 +1,260 @@
+import Script from "./Script"
+import { Overpass } from "../Logic/Osm/Overpass"
+import { RegexTag } from "../Logic/Tags/RegexTag"
+import { ImmutableStore } from "../Logic/UIEventSource"
+import { BBox } from "../Logic/BBox"
+import * as fs from "fs"
+import { Feature } from "geojson"
+import ScriptUtils from "./ScriptUtils"
+import { Imgur } from "../Logic/ImageProviders/Imgur"
+import { LicenseInfo } from "../Logic/ImageProviders/LicenseInfo"
+import { Utils } from "../Utils"
+
+/**
+ * Downloads (from overpass) all features which have an imgur image,
+ * can fetch the attribution of every image and prints statistics about the licenses.
+ */
+export default class GenerateImageAnalysis extends Script {
+    /** Attributions are no longer fetched once this many requests (successful or failed) were made */
+    private static readonly maxApiCalls = 75000
+
+    constructor() {
+        super(
+            "Downloads (from overpass) all tags which have an imgur-image; then analyses the licenses"
+        )
+    }
+
+    /**
+     * part / total as percentage with one decimal (rounded down); 0 if `total` is 0
+     */
+    private static percentage(part: number, total: number): number {
+        if (total === 0) {
+            return 0
+        }
+        return Math.floor((1000 * part) / total) / 10
+    }
+
+    /**
+     * Queries overpass for all features where `key` points to an imgur image
+     * and writes the result as geojson into `datapath`.
+     * Does nothing if the target file already exists.
+     */
+    async fetchImages(key: string, datapath: string): Promise<void> {
+        const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/g, "_")}.geojson`
+        if (fs.existsSync(targetPath)) {
+            console.log("Skipping", key)
+            return
+        }
+        // Query the requested key: a fixed "image" would write the same features into every file
+        const tag = new RegexTag(key, /https:\/\/i.imgur.com\/.*/i)
+        const overpass = new Overpass(
+            tag,
+            [],
+            "https://overpass.kumi.systems/api/interpreter",
+            new ImmutableStore(180),
+            undefined,
+            false
+        )
+        console.log("Starting query...")
+        const data = await overpass.queryGeoJson(BBox.global)
+        console.log("Got data: ", data[0].features.length)
+        fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8")
+        console.log("Written", targetPath)
+    }
+
+    /**
+     * Creates `datapath` if needed and fetches the features for `image` and `image:0` up to `image:4`.
+     */
+    async downloadData(datapath: string): Promise<void> {
+        if (!fs.existsSync(datapath)) {
+            // recursive: the parent directory of the default datapath might not exist either
+            fs.mkdirSync(datapath, { recursive: true })
+        }
+
+        await this.fetchImages("image", datapath)
+        for (let i = 0; i < 5; i++) {
+            await this.fetchImages("image:" + i, datapath)
+        }
+    }
+
+    /**
+     * Returns the features of all `.geojson` files listed by ScriptUtils.readDirRecSync(datapath).
+     */
+    loadData(datapath: string): Feature[] {
+        const allFeatures: Feature[] = []
+
+        const files = ScriptUtils.readDirRecSync(datapath)
+        for (const file of files) {
+            if (!file.endsWith(".geojson")) {
+                continue
+            }
+            const contents = JSON.parse(fs.readFileSync(file, "utf8"))
+            // No push(...features): spreading a very large array can exceed the engine's argument limit
+            for (const feature of contents.features) {
+                allFeatures.push(feature)
+            }
+        }
+
+        return allFeatures
+    }
+
+    /**
+     * Downloads the attribution of one image and stores it as json in `datapath`.
+     * @returns true if the attribution was downloaded; false if the image was skipped
+     * (no image, a .png or .jpeg, or the attribution file already exists)
+     */
+    async fetchImageMetadata(datapath: string, image: string): Promise<boolean> {
+        if (image === undefined) {
+            return false
+        }
+        if (image.endsWith(".png") || image.endsWith(".jpeg")) {
+            console.log("Skipped invalid image")
+            return false
+        }
+        const targetPath = datapath + "/" + image.replace(/[\/:.\-%]/g, "_") + ".json"
+        if (fs.existsSync(targetPath)) {
+            return false
+        }
+        const attribution = await Imgur.singleton.DownloadAttribution(image)
+        fs.writeFileSync(targetPath, JSON.stringify(attribution, null, " "))
+        return true
+    }
+
+    /**
+     * Collects the images of all downloaded features and fetches the attribution of each of them,
+     * while logging the progress. Stops when `maxApiCalls` requests were made.
+     */
+    async downloadMetadata(datapath: string): Promise<void> {
+        const features = this.loadData(datapath)
+        const allImages = new Set<string>()
+
+        // image:5 up to image:9 are not queried by downloadData, but can be set on a downloaded feature
+        const keys = ["image"]
+        for (let i = 0; i < 10; i++) {
+            keys.push("image:" + i)
+        }
+        for (const feature of features) {
+            const properties = feature.properties ?? {}
+            for (const key of keys) {
+                const image = properties[key]
+                if (image !== undefined) {
+                    allImages.add(image)
+                }
+            }
+        }
+        console.log("Detected", allImages.size, "images")
+        let handled = 0
+        let downloaded = 0
+        let skipped = 0
+        let failed = 0
+        const start = Date.now()
+        for (const image of Array.from(allImages)) {
+            handled++
+            try {
+                if (await this.fetchImageMetadata(datapath, image)) {
+                    downloaded++
+                } else {
+                    skipped++
+                }
+            } catch (e) {
+                console.log(e)
+                failed++
+            }
+
+            const apiCalls = downloaded + failed
+            const runningSecs = (Date.now() - start) / 1000
+            const left = allImages.size - handled
+            // The estimate is based on the time per API call, so there is none before the first call
+            let eta = "unknown"
+            if (apiCalls > 0) {
+                const etaSeconds = Math.floor((left * runningSecs) / apiCalls)
+                eta = `${Math.floor(etaSeconds / 60)}:${etaSeconds % 60}`
+            }
+            console.log(
+                `${handled}/${allImages.size} downloaded: ${downloaded},skipped: ${skipped}, failed: ${failed}, running: ${runningSecs}sec, ETA: ${eta}`
+            )
+
+            if (apiCalls >= GenerateImageAnalysis.maxApiCalls) {
+                console.log(
+                    `Used ${GenerateImageAnalysis.maxApiCalls} API calls, leaving 5000 for the rest of the day...`
+                )
+                return
+            }
+        }
+    }
+
+    /**
+     * Reads all `.json` files listed by ScriptUtils.readDirRecSync(datapath) as attribution
+     * and prints the number of pictures and authors per license.
+     */
+    analyze(datapath: string): void {
+        const files = ScriptUtils.readDirRecSync(datapath)
+        const byAuthor = new Map<string, string[]>()
+        const byLicense = new Map<string, string[]>()
+        const authorsByLicense = new Map<string, Set<string>>()
+        for (const file of files) {
+            if (!file.endsWith(".json")) {
+                continue
+            }
+            const attr: LicenseInfo = JSON.parse(fs.readFileSync(file, "utf8"))
+            const artist = attr?.artist
+            const license = attr?.licenseShortName
+
+            if (artist !== undefined) {
+                if (!byAuthor.has(artist)) {
+                    byAuthor.set(artist, [])
+                }
+                byAuthor.get(artist).push(file)
+            }
+            if (license !== undefined) {
+                if (!byLicense.has(license)) {
+                    byLicense.set(license, [])
+                    authorsByLicense.set(license, new Set<string>())
+                }
+                byLicense.get(license).push(file)
+                // Unknown authors are not counted, as they are not part of `totalAuthors` either
+                if (artist !== undefined) {
+                    authorsByLicense.get(license).add(artist)
+                }
+            }
+        }
+
+        const byLicenseCount = Utils.MapToObj(byLicense, (a) => a.length)
+        const byAuthorCount = Utils.MapToObj(byAuthor, (a) => a.length)
+        const authorsByLicenseCount = Utils.MapToObj(authorsByLicense, (a) => a.size)
+        console.log(byAuthorCount)
+        console.log(byLicenseCount)
+        console.log(authorsByLicenseCount)
+
+        const totalAuthors = byAuthor.size
+        let totalLicensedImages = 0
+        for (const license in byLicenseCount) {
+            totalLicensedImages += byLicenseCount[license]
+        }
+        for (const license in byLicenseCount) {
+            const total = byLicenseCount[license]
+            const authors = authorsByLicenseCount[license]
+            const imagesPerAuthor = authors > 0 ? Math.floor(total / authors) : 0
+            const pictureShare = GenerateImageAnalysis.percentage(total, totalLicensedImages)
+            const authorShare = GenerateImageAnalysis.percentage(authors, totalAuthors)
+            console.log(
+                `License ${license}: ${total} total pictures (${pictureShare}%), ${authors} authors (${authorShare}%), ${imagesPerAuthor} images/author`
+            )
+        }
+    }
+
+    /**
+     * Downloads the features (if not yet on disk) and prints the license statistics.
+     * @param args args[0] is the directory for the data; defaults to "../MapComplete-data/ImageLicenseInfo"
+     */
+    async main(args: string[]): Promise<void> {
+        const datapath = args[0] ?? "../MapComplete-data/ImageLicenseInfo"
+        await this.downloadData(datapath)
+
+        // NOTE(review): disabled in the original commit, presumably because every image costs an API call
+        // await this.downloadMetadata(datapath)
+        this.analyze(datapath)
+    }
+}
+
+new GenerateImageAnalysis().run()