diff --git a/Docs/Reasonings/ImageLicenseAnalysis.md b/Docs/Reasonings/ImageLicenseAnalysis.md
index ee2d26cae..b0c72a6d4 100644
--- a/Docs/Reasonings/ImageLicenseAnalysis.md
+++ b/Docs/Reasonings/ImageLicenseAnalysis.md
@@ -1,3 +1,5 @@
+(To rerun the analysis: use 'scripts/generateImageAnalysis'. Pass '--cached' to reuse the already downloaded 'features_with_*.geojson' files; without it, the OSM-dataset is downloaded again)
+
 # What licenses are used?
 
 Now that MapComplete is three-and-a-half year old, it's a good time to see what license people are using to upload their images.
diff --git a/scripts/ScriptUtils.ts b/scripts/ScriptUtils.ts
index 0e86c0e3d..617f6c153 100644
--- a/scripts/ScriptUtils.ts
+++ b/scripts/ScriptUtils.ts
@@ -1,9 +1,9 @@
 import * as fs from "fs"
-import { existsSync, lstatSync, readdirSync, readFileSync } from "fs"
-import { Utils } from "../Utils"
+import {existsSync, lstatSync, readdirSync, readFileSync} from "fs"
+import {Utils} from "../Utils"
 import * as https from "https"
-import { LayoutConfigJson } from "../Models/ThemeConfig/Json/LayoutConfigJson"
-import { LayerConfigJson } from "../Models/ThemeConfig/Json/LayerConfigJson"
+import {LayoutConfigJson} from "../Models/ThemeConfig/Json/LayoutConfigJson"
+import {LayerConfigJson} from "../Models/ThemeConfig/Json/LayerConfigJson"
 import xml2js from "xml2js"
 
 export default class ScriptUtils {
@@ -37,14 +37,34 @@ export default class ScriptUtils {
         return result
     }
 
-    public static DownloadFileTo(url, targetFilePath: string): void {
-        console.log("Downloading ", url, "to", targetFilePath)
-        https.get(url, (res) => {
-            const filePath = fs.createWriteStream(targetFilePath)
-            res.pipe(filePath)
-            filePath.on("finish", () => {
-                filePath.close()
-                console.log("Download Completed")
+    /**
+     * Downloads the given URL over https and writes the response body to 'targetFilePath'.
+     *
+     * @param url the https-URL to download
+     * @param targetFilePath the path of the file to write
+     * @returns a promise which resolves when the file is written completely and which rejects
+     * if the request fails, the server answers with an error status or the file cannot be written
+     */
+    public static DownloadFileTo(url, targetFilePath: string): Promise<void> {
+        ScriptUtils.erasableLog("Downloading", url, "to", targetFilePath)
+        return new Promise<void>((resolve, reject) => {
+            const request = https.get(url, (res) => {
+                if ((res.statusCode ?? 0) >= 400) {
+                    // Don't store an error page under the name of the requested file
+                    res.resume()
+                    reject(new Error("Could not download " + url + ": HTTP " + res.statusCode))
+                    return
+                }
+                const filePath = fs.createWriteStream(targetFilePath)
+                // Without these handlers, a broken stream would leave the promise pending forever
+                res.on("error", reject)
+                filePath.on("error", reject)
+                res.pipe(filePath)
+                filePath.on("finish", () => {
+                    filePath.close()
+                    resolve()
+                })
             })
+            request.on("error", reject)
         })
     }
@@ -78,13 +98,13 @@ export default class ScriptUtils {
             .filter((path) => path.indexOf("license_info.json") < 0)
             .map((path) => {
                 try {
-                    const contents = readFileSync(path, { encoding: "utf8" })
+                    const contents = readFileSync(path, {encoding: "utf8"})
                     if (contents === "") {
                         throw "The file " + path + " is empty, did you properly save?"
                     }
 
                     const parsed = JSON.parse(contents)
-                    return { parsed, path }
+                    return {parsed, path}
                 } catch (e) {
                     console.error("Could not parse file ", "./assets/layers/" + path, "due to ", e)
                     throw e
@@ -101,12 +121,12 @@ export default class ScriptUtils {
     public static getThemeFiles(): { parsed: LayoutConfigJson; path: string }[] {
         return this.getThemePaths().map((path) => {
             try {
-                const contents = readFileSync(path, { encoding: "utf8" })
+                const contents = readFileSync(path, {encoding: "utf8"})
                 if (contents === "") {
                     throw "The file " + path + " is empty, did you properly save?"
                 }
                 const parsed = JSON.parse(contents)
-                return { parsed: parsed, path: path }
+                return {parsed: parsed, path: path}
             } catch (e) {
                 console.error("Could not read file ", path, "due to ", e)
                 throw e
@@ -125,14 +145,14 @@ export default class ScriptUtils {
         if (!existsSync(path)) {
             throw "File not found: " + path
         }
-        const root = await xml2js.parseStringPromise(readFileSync(path, { encoding: "utf8" }))
+        const root = await xml2js.parseStringPromise(readFileSync(path, {encoding: "utf8"}))
         return root.svg
     }
 
     public static ReadSvgSync(path: string, callback: (svg: any) => void): any {
         xml2js.parseString(
-            readFileSync(path, { encoding: "utf8" }),
-            { async: false },
+            readFileSync(path, {encoding: "utf8"}),
+            {async: false},
             (err, root) => {
                 if (err) {
                     throw err
@@ -171,7 +191,7 @@ export default class ScriptUtils {
                         })
 
                         res.addListener("end", function () {
-                            resolve({ content: parts.join("") })
+                            resolve({content: parts.join("")})
                         })
                     }
                 )
diff --git a/scripts/generateImageAnalysis.ts b/scripts/generateImageAnalysis.ts
index 7a002f441..be6d83eef 100644
--- a/scripts/generateImageAnalysis.ts
+++ b/scripts/generateImageAnalysis.ts
@@ -1,14 +1,15 @@
 import Script from "./Script"
-import { Overpass } from "../Logic/Osm/Overpass"
-import { RegexTag } from "../Logic/Tags/RegexTag"
-import { ImmutableStore } from "../Logic/UIEventSource"
-import { BBox } from "../Logic/BBox"
+import {Overpass} from "../Logic/Osm/Overpass"
+import {RegexTag} from "../Logic/Tags/RegexTag"
+import {ImmutableStore} from "../Logic/UIEventSource"
+import {BBox} from "../Logic/BBox"
 import * as fs from "fs"
-import { Feature } from "geojson"
+import {Feature} from "geojson"
 import ScriptUtils from "./ScriptUtils"
-import { Imgur } from "../Logic/ImageProviders/Imgur"
-import { LicenseInfo } from "../Logic/ImageProviders/LicenseInfo"
-import { Utils } from "../Utils"
+import {Imgur} from "../Logic/ImageProviders/Imgur"
+import {LicenseInfo} from "../Logic/ImageProviders/LicenseInfo"
+import {Utils} from "../Utils"
+import Constants from "../Models/Constants"
 
 export default class GenerateImageAnalysis extends Script {
     constructor() {
@@ -17,9 +18,17 @@ export default class GenerateImageAnalysis extends Script {
         )
     }
 
-    async fetchImages(key: string, datapath: string): Promise<void> {
+    /**
+     * Runs a global Overpass query for 'key' and writes the resulting geojson to
+     * '<datapath>/features_with_<key>.geojson'.
+     *
+     * @param key the OSM-key to query, e.g. 'image' or 'image:streetsign'
+     * @param datapath the directory in which the geojson file is written
+     * @param refresh if false, the query is skipped when the target file already exists
+     */
+    async fetchImages(key: string, datapath: string, refresh: boolean = false): Promise<void> {
         const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson`
-        if (fs.existsSync(targetPath)) {
+        if (fs.existsSync(targetPath) && !refresh) {
             console.log("Skipping", key)
             return
         }
@@ -27,27 +36,32 @@ export default class GenerateImageAnalysis extends Script {
         const overpass = new Overpass(
             tag,
             [],
-            "https://overpass.kumi.systems/api/interpreter",
+            Constants.defaultOverpassUrls[0],
             new ImmutableStore(500),
             undefined,
             false
         )
         console.log("Starting query...")
         const data = await overpass.queryGeoJson(BBox.global)
-        console.log("Got data: ", data[0].features.length)
+        console.log("Got data:", data[0].features.length, "items; timestamp:", data[1].toISOString())
         fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8")
         console.log("Written", targetPath)
     }
 
-    async downloadData(datapath: string): Promise<void> {
+    /**
+     * Downloads the geojson files for 'image', 'image:streetsign' and 'image:0' up to 'image:4'.
+     *
+     * @param datapath the directory to store the data in; created if it does not exist yet
+     * @param refresh if true, geojson files which already exist are downloaded again
+     */
+    async downloadData(datapath: string, refresh: boolean = false): Promise<void> {
         if (!fs.existsSync(datapath)) {
             fs.mkdirSync(datapath)
         }
-
-        await this.fetchImages("image", datapath)
-        await this.fetchImages("image:streetsign", datapath)
+        await this.fetchImages("image", datapath, refresh)
+        await this.fetchImages("image:streetsign", datapath, refresh)
         for (let i = 0; i < 5; i++) {
-            await this.fetchImages("image:" + i, datapath)
+            await this.fetchImages("image:" + i, datapath, refresh)
         }
     }
 
@@ -73,25 +87,65 @@ export default class GenerateImageAnalysis extends Script {
         if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) {
             return false
         }
-        const targetPath = datapath + "/" + image.replace(/[\/:.\-%]/g, "_") + ".json"
+        const filename = image.replace(/[\/:.\-%]/g, "_") + ".json"
+        const targetPath = datapath + "/" + filename
         if (fs.existsSync(targetPath)) {
             return false
         }
         const attribution = await Imgur.singleton.DownloadAttribution(image)
+
+        if ((attribution.artist ?? "") === "") {
+            // This is an invalid attribution. We save the raw response as well
+            const hash = image.substring("https://i.imgur.com/".length).split(".jpg")[0]
+
+            const apiUrl = "https://api.imgur.com/3/image/" + hash
+            const response = await Utils.downloadJsonCached(apiUrl, 365 * 24 * 60 * 60, {
+                Authorization: "Client-ID " + Constants.ImgurApiKey,
+            })
+            const rawDirectory = datapath + "/raw"
+            // writeFileSync does not create missing directories
+            fs.mkdirSync(rawDirectory, {recursive: true})
+            const rawTarget = rawDirectory + "/" + filename
+            console.log("Also storing the raw response to", rawTarget)
+            fs.writeFileSync(rawTarget, JSON.stringify(response, null, " "))
+        }
+
         await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, " "))
         return true
     }
 
-    async downloadMetadata(datapath: string): Promise<void> {
-        const features = this.loadData(datapath)
+    /**
+     * Collects the values of 'image', 'image:streetsign' and 'image:0' up to 'image:9'
+     * of all features loaded from 'datapath'.
+     *
+     * @returns 'allImages': the distinct values; 'imageSource': for every value, the id of the
+     * last feature carrying it, followed by a hint of the key for keys other than 'image'
+     */
+    loadImageUrls(datapath: string): { allImages: Set<string>; imageSource: Map<string, string> } {
         let allImages = new Set<string>()
+        const features = this.loadData(datapath)
+        const imageSource = new Map<string, string>()
         for (const feature of features) {
             allImages.add(feature.properties["image"])
+            imageSource.set(feature.properties["image"], feature.properties.id)
+
             allImages.add(feature.properties["image:streetsign"])
+            imageSource.set(feature.properties["image:streetsign"], feature.properties.id + " (streetsign)")
+
             for (let i = 0; i < 10; i++) {
                 allImages.add(feature.properties["image:" + i])
+                imageSource.set(feature.properties["image:" + i], `${feature.properties.id} (image:${i})`)
             }
         }
+        allImages.delete(undefined)
+        allImages.delete(null)
+        imageSource.delete(undefined)
+        imageSource.delete(null)
+        return {allImages, imageSource}
+    }
+
+    async downloadMetadata(datapath: string): Promise<void> {
+        const {allImages, imageSource} = this.loadImageUrls(datapath)
         console.log("Detected", allImages.size, "images")
         let i = 0
         let d = 0
@@ -113,10 +167,9 @@ export default class GenerateImageAnalysis extends Script {
                 } downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor(
                     runningSecs
                 )}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}`
-                ScriptUtils.erasableLog(
-                    " ",
-                    msg
-                )
+                if ((d + f) % 1000 === 1 || downloaded) {
+                    ScriptUtils.erasableLog(msg)
+                }
                 if (downloaded) {
                     d++
                 } else {
@@ -124,10 +177,97 @@ export default class GenerateImageAnalysis extends Script {
                 }
                 if (d + f == 75000) {
                     console.log("Used 75000 API calls, leaving 5000 for the rest of the day...")
+                    break
+                }
+            } catch (e) {
+                // console.log(e)
+                console.log("Offending image hash is", image, "from https://openstreetmap.org/" + imageSource.get(image))
+                f++
+            }
+        }
+    }
+
+    /**
+     * Downloads a single imgur-image into 'imagePath', unless it is already there.
+     * A copy stored under the long filename (the URL with special characters replaced by '_')
+     * is renamed to the short filename, or removed if the short filename exists as well.
+     *
+     * @param url the URL of the image; expected to start with 'https://i.imgur.com/'
+     * @param imagePath the directory in which the images are stored
+     * @returns true if the image was downloaded, false if it was already present
+     */
+    async downloadImage(url: string, imagePath: string): Promise<boolean> {
+        const filenameLong = url.replace(/[\/:.\-%]/g, "_") + ".jpg"
+        const targetPathLong = imagePath + "/" + filenameLong
+
+        const filename = url.substring("https://i.imgur.com/".length)
+        const targetPath = imagePath + "/" + filename
+        if (fs.existsSync(targetPathLong)) {
+            if (fs.existsSync(targetPath)) {
+                fs.unlinkSync(targetPathLong)
+                console.log("Unlinking duplicate")
+                return false
+            }
+            console.log("Renaming...")
+            fs.renameSync(targetPathLong, targetPath)
+            return false
+        }
+        if (fs.existsSync(targetPath)) {
+            return false
+        }
+        await ScriptUtils.DownloadFileTo(url, targetPath)
+        return true
+    }
+
+    /**
+     * Downloads all imgur-images referenced by the features in 'datapath' into 'imagePath'.
+     * A tag value can contain multiple URLs separated by ';', which are downloaded in parallel.
+     *
+     * @param datapath the directory containing the geojson files
+     * @param imagePath the directory in which the images are stored
+     */
+    async downloadAllImages(datapath: string, imagePath: string): Promise<void> {
+        const {allImages} = this.loadImageUrls(datapath)
+        let skipped = 0
+        let failed = 0
+        let downloaded = 0
+        let invalid = 0
+        const startTime = Date.now()
+        const urls = Array.from(allImages).filter(url => url.startsWith("https://i.imgur.com"))
+        for (const url of urls) {
+            const runningTime = (Date.now() - startTime) / 1000
+            const handled = skipped + downloaded + failed
+            // Only the imgur-URLs are handled, so only those count as remaining work
+            const itemsLeft = urls.length - handled
+            const speed = handled / runningTime
+            // speed is in items per second; it is 0 (or NaN) as long as nothing is handled yet
+            const timeLeft = speed > 0 ? Math.round(itemsLeft / speed) : "?"
+            try {
+                const downloadedStatus = await Promise.all(url.split(";").map(part =>
+                    this.downloadImage(part.trim(), imagePath),
+                ))
+
+                for (const b of downloadedStatus) {
+                    if (b) {
+                        downloaded += 1
+                    } else {
+                        skipped += 1
+                    }
+                }
+
+                if (downloadedStatus.some(i => i) || skipped % 10000 === 0) {
+                    console.log("Handled", url, JSON.stringify({
+                        skipped,
+                        failed,
+                        downloaded,
+                        invalid,
+                        total: allImages.size,
+                        eta: timeLeft + "s"
+                    }))
                 }
             } catch (e) {
                 console.log(e)
-                f++
+                failed++
             }
         }
     }
@@ -141,7 +281,7 @@ export default class GenerateImageAnalysis extends Script {
             if (!file.endsWith(".json")) {
                 continue
             }
-            const attr = JSON.parse(fs.readFileSync(file, { encoding: "utf8" }))
+            const attr = JSON.parse(fs.readFileSync(file, {encoding: "utf8"}))
             const license = attr.licenseShortName
 
             if (license === undefined || attr.artist === undefined) {
@@ -220,7 +360,7 @@ export default class GenerateImageAnalysis extends Script {
             ...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()),
         ]
 
-        console.log("Total number of correctly licenses pictures: ", totalLicensedImages)
+        console.log("Total number of correctly licenses pictures: ", totalLicensedImages, "(out of ", files.length, " images)")
         console.log("Total number of authors:", byAuthor.size)
         console.log(
             "Total number of authors which used a valid, non CC0 license at one point in time",
@@ -230,10 +370,23 @@ export default class GenerateImageAnalysis extends Script {
     }
 
+    /**
+     * Entry point: downloads the OSM-data, the image attributions and the images, then runs the analysis.
+     *
+     * @param args optional flag '--cached' (reuse already downloaded OSM-data), followed by the
+     * optional data directory and the optional directory to back up the images into
+     */
     async main(args: string[]): Promise<void> {
+        console.log("Usage: [--cached] to use the cached osm data")
+        console.log("Args are", args)
+        // The OSM-data is downloaded again unless '--cached' is given
+        const refresh = args.indexOf("--cached") < 0
+        args = args.filter(a => a !== "--cached")
         const datapath = args[0] ?? "../../git/MapComplete-data/ImageLicenseInfo"
-        await this.downloadData(datapath)
+        const imagePath = args[1] ?? "/home/pietervdvn/data/imgur-image-backup"
+        await this.downloadData(datapath, refresh)
 
         await this.downloadMetadata(datapath)
+        await this.downloadAllImages(datapath, imagePath)
         this.analyze(datapath)
     }
 }