Scripts: Update imageAnalysis script to also backup the images

This commit is contained in:
Pieter Vander Vennet 2023-05-18 13:07:14 +02:00
parent e5cc7eec71
commit 088fbe1d07
3 changed files with 156 additions and 48 deletions

View file

@ -1,3 +1,5 @@
(To rerun the analysis: use 'scripts/generateImageAnalysis'. Delete 'features_with_*.geojson' first to force updating the OSM-dataset)
# What licenses are used?
Now that MapComplete is three and a half years old, it's a good time to see which licenses people are using when they upload their images.

View file

@ -1,9 +1,9 @@
import * as fs from "fs"
import { existsSync, lstatSync, readdirSync, readFileSync } from "fs"
import { Utils } from "../Utils"
import {existsSync, lstatSync, readdirSync, readFileSync} from "fs"
import {Utils} from "../Utils"
import * as https from "https"
import { LayoutConfigJson } from "../Models/ThemeConfig/Json/LayoutConfigJson"
import { LayerConfigJson } from "../Models/ThemeConfig/Json/LayerConfigJson"
import {LayoutConfigJson} from "../Models/ThemeConfig/Json/LayoutConfigJson"
import {LayerConfigJson} from "../Models/ThemeConfig/Json/LayerConfigJson"
import xml2js from "xml2js"
export default class ScriptUtils {
@ -37,14 +37,16 @@ export default class ScriptUtils {
return result
}
public static DownloadFileTo(url, targetFilePath: string): void {
console.log("Downloading ", url, "to", targetFilePath)
https.get(url, (res) => {
const filePath = fs.createWriteStream(targetFilePath)
res.pipe(filePath)
filePath.on("finish", () => {
filePath.close()
console.log("Download Completed")
/**
 * Downloads the resource at `url` over HTTPS and streams the response body into `targetFilePath`.
 *
 * The returned promise resolves once the write stream has emitted "finish".
 * It rejects when the request, the response stream or the file stream emits an error;
 * without these listeners a failure would leave the promise pending forever or
 * crash the process with an unhandled "error" event.
 *
 * The HTTP status code is not inspected: whatever body the server sends is written.
 * NOTE(review): after a rejection a partially written file may remain at `targetFilePath`.
 *
 * @param url the URL to fetch (handed as-is to `https.get`)
 * @param targetFilePath path of the file the body is written to
 */
public static DownloadFileTo(url, targetFilePath: string): Promise<void> {
    ScriptUtils.erasableLog("Downloading", url, "to", targetFilePath)
    return new Promise<void>((resolve, reject) => {
        const request = https.get(url, (res) => {
            const fileStream = fs.createWriteStream(targetFilePath)
            fileStream.on("error", (e) => {
                // Writing failed (e.g. missing directory): stop reading the response
                res.destroy()
                reject(e)
            })
            res.on("error", (e) => {
                // The download broke off halfway: release the file handle
                fileStream.destroy()
                reject(e)
            })
            fileStream.on("finish", () => {
                fileStream.close()
                resolve()
            })
            res.pipe(fileStream)
        })
        // Connection-level failures (DNS, TLS, refused connection, ...)
        request.on("error", reject)
    })
}
@ -78,13 +80,13 @@ export default class ScriptUtils {
.filter((path) => path.indexOf("license_info.json") < 0)
.map((path) => {
try {
const contents = readFileSync(path, { encoding: "utf8" })
const contents = readFileSync(path, {encoding: "utf8"})
if (contents === "") {
throw "The file " + path + " is empty, did you properly save?"
}
const parsed = JSON.parse(contents)
return { parsed, path }
return {parsed, path}
} catch (e) {
console.error("Could not parse file ", "./assets/layers/" + path, "due to ", e)
throw e
@ -101,12 +103,12 @@ export default class ScriptUtils {
public static getThemeFiles(): { parsed: LayoutConfigJson; path: string }[] {
return this.getThemePaths().map((path) => {
try {
const contents = readFileSync(path, { encoding: "utf8" })
const contents = readFileSync(path, {encoding: "utf8"})
if (contents === "") {
throw "The file " + path + " is empty, did you properly save?"
}
const parsed = JSON.parse(contents)
return { parsed: parsed, path: path }
return {parsed: parsed, path: path}
} catch (e) {
console.error("Could not read file ", path, "due to ", e)
throw e
@ -125,14 +127,14 @@ export default class ScriptUtils {
if (!existsSync(path)) {
throw "File not found: " + path
}
const root = await xml2js.parseStringPromise(readFileSync(path, { encoding: "utf8" }))
const root = await xml2js.parseStringPromise(readFileSync(path, {encoding: "utf8"}))
return root.svg
}
public static ReadSvgSync(path: string, callback: (svg: any) => void): any {
xml2js.parseString(
readFileSync(path, { encoding: "utf8" }),
{ async: false },
readFileSync(path, {encoding: "utf8"}),
{async: false},
(err, root) => {
if (err) {
throw err
@ -171,7 +173,7 @@ export default class ScriptUtils {
})
res.addListener("end", function () {
resolve({ content: parts.join("") })
resolve({content: parts.join("")})
})
}
)

View file

@ -1,14 +1,15 @@
import Script from "./Script"
import { Overpass } from "../Logic/Osm/Overpass"
import { RegexTag } from "../Logic/Tags/RegexTag"
import { ImmutableStore } from "../Logic/UIEventSource"
import { BBox } from "../Logic/BBox"
import {Overpass} from "../Logic/Osm/Overpass"
import {RegexTag} from "../Logic/Tags/RegexTag"
import {ImmutableStore} from "../Logic/UIEventSource"
import {BBox} from "../Logic/BBox"
import * as fs from "fs"
import { Feature } from "geojson"
import {Feature} from "geojson"
import ScriptUtils from "./ScriptUtils"
import { Imgur } from "../Logic/ImageProviders/Imgur"
import { LicenseInfo } from "../Logic/ImageProviders/LicenseInfo"
import { Utils } from "../Utils"
import {Imgur} from "../Logic/ImageProviders/Imgur"
import {LicenseInfo} from "../Logic/ImageProviders/LicenseInfo"
import {Utils} from "../Utils"
import Constants from "../Models/Constants";
export default class GenerateImageAnalysis extends Script {
constructor() {
@ -17,9 +18,9 @@ export default class GenerateImageAnalysis extends Script {
)
}
async fetchImages(key: string, datapath: string): Promise<void> {
async fetchImages(key: string, datapath: string, refresh: boolean): Promise<void> {
const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson`
if (fs.existsSync(targetPath)) {
if (fs.existsSync(targetPath) && !refresh) {
console.log("Skipping", key)
return
}
@ -27,27 +28,26 @@ export default class GenerateImageAnalysis extends Script {
const overpass = new Overpass(
tag,
[],
"https://overpass.kumi.systems/api/interpreter",
Constants.defaultOverpassUrls[0], //"https://overpass.kumi.systems/api/interpreter",
new ImmutableStore(500),
undefined,
false
)
console.log("Starting query...")
const data = await overpass.queryGeoJson(BBox.global)
console.log("Got data: ", data[0].features.length)
console.log("Got data:", data[0].features.length, "items; timestamp:", data[1].toISOString())
fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8")
console.log("Written", targetPath)
}
async downloadData(datapath: string): Promise<void> {
/**
 * Makes sure the OSM extracts for all image keys are present in `datapath`.
 *
 * Creates `datapath` if it does not exist yet, then calls `fetchImages` for
 * `image`, `image:streetsign` and `image:0` up to and including `image:4`,
 * one after the other. Each key is fetched exactly once and always with the
 * `refresh` flag.
 *
 * @param datapath directory in which the `features_with_*.geojson` files live
 * @param refresh passed through to `fetchImages` for every key
 */
async downloadData(datapath: string, refresh: boolean): Promise<void> {
    if (!fs.existsSync(datapath)) {
        fs.mkdirSync(datapath)
    }

    const keys: string[] = ["image", "image:streetsign"]
    for (let i = 0; i < 5; i++) {
        keys.push("image:" + i)
    }

    // Deliberately sequential: every key results in its own query
    for (const key of keys) {
        await this.fetchImages(key, datapath, refresh)
    }
}
@ -73,25 +73,55 @@ export default class GenerateImageAnalysis extends Script {
if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) {
return false
}
const targetPath = datapath + "/" + image.replace(/[\/:.\-%]/g, "_") + ".json"
const filename = image.replace(/[\/:.\-%]/g, "_") + ".json"
const targetPath = datapath + "/" + filename
if (fs.existsSync(targetPath)) {
return false
}
const attribution = await Imgur.singleton.DownloadAttribution(image)
if ((attribution.artist ?? "") === "") {
// This is an invalid attribution. We save the raw response as well
const hash = image.substr("https://i.imgur.com/".length).split(".jpg")[0]
const apiUrl = "https://api.imgur.com/3/image/" + hash
const response = await Utils.downloadJsonCached(apiUrl, 365 * 24 * 60 * 60, {
Authorization: "Client-ID " + Constants.ImgurApiKey,
})
const rawTarget = datapath + "/raw/" + filename
console.log("Also storing the raw response to", rawTarget)
await fs.writeFileSync(rawTarget, JSON.stringify(response, null, " "))
}
await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, " "))
return true
}
async downloadMetadata(datapath: string): Promise<void> {
const features = this.loadData(datapath)
/**
 * Collects the values of the `image`, `image:streetsign` and `image:0` ... `image:9`
 * properties of every feature returned by `loadData(datapath)`.
 *
 * @param datapath directory that is handed to `loadData`
 * @returns `allImages`: the distinct tag values that were found (a value is stored
 *          verbatim, so it may still hold several `;`-separated URLs);
 *          `imageSource`: for every such value, the id of the feature it came from,
 *          suffixed with the tag key for the non-default keys.
 */
loadImageUrls(datapath: string): { allImages: Set<string>, imageSource: Map<string, string> } {
    const allImages = new Set<string>()
    const imageSource: Map<string, string> = new Map<string, string>()

    const register = (value: string | undefined | null, source: string): void => {
        if (value === undefined || value === null) {
            // The feature does not carry this tag
            return
        }
        allImages.add(value)
        // Real Map entry, so that get/has/size/delete behave as the return type promises
        imageSource.set(value, source)
        // Also kept as a plain property: existing code reads the source as `imageSource[image]`
        imageSource[value] = source
    }

    for (const feature of this.loadData(datapath)) {
        const id = feature.properties.id
        register(feature.properties["image"], id)
        register(feature.properties["image:streetsign"], id + " (streetsign)")
        for (let i = 0; i < 10; i++) {
            register(feature.properties["image:" + i], `${id} (image:${i})`)
        }
    }

    return {allImages, imageSource}
}
async downloadMetadata(datapath: string): Promise<void> {
const {allImages, imageSource} = this.loadImageUrls(datapath)
console.log("Detected", allImages.size, "images")
let i = 0
let d = 0
@ -113,10 +143,9 @@ export default class GenerateImageAnalysis extends Script {
} downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor(
runningSecs
)}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}`
ScriptUtils.erasableLog(
" ",
msg
)
if (d + f % 1000 === 1 || downloaded) {
ScriptUtils.erasableLog(msg)
}
if (downloaded) {
d++
} else {
@ -124,10 +153,80 @@ export default class GenerateImageAnalysis extends Script {
}
if (d + f == 75000) {
console.log("Used 75000 API calls, leaving 5000 for the rest of the day...")
break
}
} catch (e) {
// console.log(e)
console.log("Offending image hash is", image, "from https://openstreetmap.org/" + imageSource[image])
f++
}
}
}
/**
 * Downloads one imgur image into `imagePath`, unless it is already there.
 *
 * The file is stored under the part of the URL after `https://i.imgur.com/`.
 * A file stored under the long, fully escaped name (presumably an older naming
 * scheme; confirm) is renamed to the short name, or removed if the short one
 * exists as well.
 *
 * The URL comes from an OSM tag and is thus untrusted: it is rejected if it does
 * not start with the imgur prefix or if the remainder is not a single file name,
 * as a `/` or `..` would let the target escape `imagePath`.
 *
 * @param url the image URL, expected to start with `https://i.imgur.com/`
 * @param imagePath directory in which the images are stored
 * @returns true if the image was downloaded now, false if it was already present
 * @throws Error if `url` cannot be mapped onto a file name within `imagePath`
 */
async downloadImage(url: string, imagePath: string): Promise<boolean> {
    const prefix = "https://i.imgur.com/"
    if (!url.startsWith(prefix)) {
        throw new Error("Not an i.imgur.com image url: " + url)
    }
    const filename = url.substring(prefix.length)
    if (filename === "" || filename === "." || filename === ".." || /[\/\\]/.test(filename)) {
        throw new Error("Refusing to download " + url + ": it does not point to a single file")
    }
    const targetPath = imagePath + "/" + filename

    const filenameLong = url.replace(/[\/:.\-%]/g, "_") + ".jpg"
    const targetPathLong = imagePath + "/" + filenameLong
    if (fs.existsSync(targetPathLong)) {
        if (fs.existsSync(targetPath)) {
            fs.unlinkSync(targetPathLong)
            console.log("Unlinking duplicate")
            return false
        }
        console.log("Renaming...")
        fs.renameSync(targetPathLong, targetPath)
        return false
    }

    if (fs.existsSync(targetPath)) {
        return false
    }
    await ScriptUtils.DownloadFileTo(url, targetPath)
    return true
}
/**
 * Backs up every imgur image referenced in the downloaded features into `imagePath`.
 *
 * Tag values are first split on `;` and trimmed; only the parts that start with
 * `https://i.imgur.com` are kept, each of them once. The images are then fetched
 * one by one through `downloadImage`. A failing download is logged and counted,
 * after which the loop continues with the next image.
 *
 * A progress line is printed after every fresh download and whenever the number of
 * skipped images is a multiple of 10000.
 *
 * @param datapath directory holding the feature files, handed to `loadImageUrls`
 * @param imagePath directory in which the images are stored
 */
async downloadAllImages(datapath: string, imagePath: string): Promise<void> {
    const {allImages} = this.loadImageUrls(datapath)
    let skipped = 0
    let failed = 0
    let downloaded = 0
    // Never incremented; only kept so that the progress line keeps its fields
    const invalid = 0
    const startTime = Date.now()

    // Split before filtering: a single tag value can mix imgur URLs with other hosts
    const uniqueUrls = new Set<string>()
    for (const value of Array.from(allImages)) {
        for (const part of value.split(";")) {
            const candidate = part.trim()
            if (candidate.startsWith("https://i.imgur.com")) {
                uniqueUrls.add(candidate)
            }
        }
    }
    const urls = Array.from(uniqueUrls)

    for (const url of urls) {
        let wasDownloaded: boolean
        try {
            wasDownloaded = await this.downloadImage(url, imagePath)
        } catch (e) {
            console.log(e)
            failed++
            continue
        }

        if (wasDownloaded) {
            downloaded += 1
        } else {
            skipped += 1
        }

        if (wasDownloaded || skipped % 10000 === 0) {
            const runningTime = (Date.now() - startTime) / 1000
            // At least one image has been handled at this point, so this never divides by zero
            const handled = skipped + downloaded + failed
            const itemsLeft = urls.length - handled
            // remaining time = remaining items * average seconds per handled item
            const timeLeft = Math.round((itemsLeft * runningTime) / handled)
            console.log("Handled", url, JSON.stringify({
                skipped,
                failed,
                downloaded,
                invalid,
                total: urls.length,
                eta: timeLeft + "s"
            }))
        }
    }
}
@ -141,7 +240,7 @@ export default class GenerateImageAnalysis extends Script {
if (!file.endsWith(".json")) {
continue
}
const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, { encoding: "utf8" }))
const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, {encoding: "utf8"}))
const license = attr.licenseShortName
if (license === undefined || attr.artist === undefined) {
@ -220,7 +319,7 @@ export default class GenerateImageAnalysis extends Script {
...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()),
]
console.log("Total number of correctly licenses pictures: ", totalLicensedImages)
console.log("Total number of correctly licenses pictures: ", totalLicensedImages, "(out of ", files.length, " images)")
console.log("Total number of authors:", byAuthor.size)
console.log(
"Total number of authors which used a valid, non CC0 license at one point in time",
@ -230,10 +329,15 @@ export default class GenerateImageAnalysis extends Script {
}
/**
 * Entry point of the script: fetches the OSM data, downloads the image metadata,
 * backs up the images and finally runs the analysis.
 *
 * Recognised arguments:
 * - `--cached` (anywhere): reuse OSM extracts that are already on disk instead of refreshing them
 * - first positional argument: the data directory (default `../../git/MapComplete-data/ImageLicenseInfo`)
 * - second positional argument: the directory the images are backed up to
 *   (default `/home/pietervdvn/data/imgur-image-backup`)
 *
 * @param args the command line arguments of the script
 */
async main(args: string[]): Promise<void> {
    console.log("Usage: [--cached] to use the cached osm data")
    console.log("Args are", args)
    // Without '--cached', the OSM data is downloaded again even if it is already on disk
    const refresh = args.indexOf("--cached") < 0
    const positional = args.filter(a => a !== "--cached")
    const datapath = positional[0] ?? "../../git/MapComplete-data/ImageLicenseInfo"
    const imageBackupPath = positional[1] ?? "/home/pietervdvn/data/imgur-image-backup"

    await this.downloadData(datapath, refresh)
    await this.downloadMetadata(datapath)
    await this.downloadAllImages(datapath, imageBackupPath)
    this.analyze(datapath)
}
}