Scripts: Update imageAnalysis script to also backup the images
parent e5cc7eec71
commit 088fbe1d07

3 changed files with 156 additions and 48 deletions

@@ -1,3 +1,5 @@
+(To rerun the analysis: use 'scripts/generateImageAnalysis'. Delete 'features_with_*.geojson' first to force updating the OSM-dataset)
+
 # What licenses are used?
 
 Now that MapComplete is three-and-a-half year old, it's a good time to see what license people are using to upload their images.

@@ -1,9 +1,9 @@
 import * as fs from "fs"
-import { existsSync, lstatSync, readdirSync, readFileSync } from "fs"
-import { Utils } from "../Utils"
+import {existsSync, lstatSync, readdirSync, readFileSync} from "fs"
+import {Utils} from "../Utils"
 import * as https from "https"
-import { LayoutConfigJson } from "../Models/ThemeConfig/Json/LayoutConfigJson"
-import { LayerConfigJson } from "../Models/ThemeConfig/Json/LayerConfigJson"
+import {LayoutConfigJson} from "../Models/ThemeConfig/Json/LayoutConfigJson"
+import {LayerConfigJson} from "../Models/ThemeConfig/Json/LayerConfigJson"
 import xml2js from "xml2js"
 
 export default class ScriptUtils {

@@ -37,14 +37,16 @@ export default class ScriptUtils {
         return result
     }
 
-    public static DownloadFileTo(url, targetFilePath: string): void {
-        console.log("Downloading ", url, "to", targetFilePath)
-        https.get(url, (res) => {
-            const filePath = fs.createWriteStream(targetFilePath)
-            res.pipe(filePath)
-            filePath.on("finish", () => {
-                filePath.close()
-                console.log("Download Completed")
-            })
-        })
-    }
+    public static DownloadFileTo(url, targetFilePath: string): Promise<void> {
+        ScriptUtils.erasableLog("Downloading", url, "to", targetFilePath)
+        return new Promise<void>((resolve, err) => {
+            https.get(url, (res) => {
+                const filePath = fs.createWriteStream(targetFilePath)
+                res.pipe(filePath)
+                filePath.on("finish", () => {
+                    filePath.close()
+                    resolve()
+                })
+            })
+        })
+    }

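Since DownloadFileTo now resolves a promise once the write stream has finished, the image-backup code added later in this commit can await each download before counting it as done. A minimal usage sketch, not taken from the commit; the URL and target path are made-up examples:

    import ScriptUtils from "./ScriptUtils"

    async function backupOneImage(): Promise<void> {
        // Hypothetical URL and path, purely for illustration
        const url = "https://i.imgur.com/abcd123.jpg"
        const target = "/tmp/imgur-backup/abcd123.jpg"
        await ScriptUtils.DownloadFileTo(url, target) // resolves after the file is fully written
        console.log("Backed up", url, "to", target)
    }
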
@@ -78,13 +80,13 @@ export default class ScriptUtils {
             .filter((path) => path.indexOf("license_info.json") < 0)
             .map((path) => {
                 try {
-                    const contents = readFileSync(path, { encoding: "utf8" })
+                    const contents = readFileSync(path, {encoding: "utf8"})
                     if (contents === "") {
                         throw "The file " + path + " is empty, did you properly save?"
                     }
 
                     const parsed = JSON.parse(contents)
-                    return { parsed, path }
+                    return {parsed, path}
                 } catch (e) {
                     console.error("Could not parse file ", "./assets/layers/" + path, "due to ", e)
                     throw e

@@ -101,12 +103,12 @@ export default class ScriptUtils {
     public static getThemeFiles(): { parsed: LayoutConfigJson; path: string }[] {
         return this.getThemePaths().map((path) => {
             try {
-                const contents = readFileSync(path, { encoding: "utf8" })
+                const contents = readFileSync(path, {encoding: "utf8"})
                 if (contents === "") {
                     throw "The file " + path + " is empty, did you properly save?"
                 }
                 const parsed = JSON.parse(contents)
-                return { parsed: parsed, path: path }
+                return {parsed: parsed, path: path}
             } catch (e) {
                 console.error("Could not read file ", path, "due to ", e)
                 throw e

@@ -125,14 +127,14 @@ export default class ScriptUtils {
         if (!existsSync(path)) {
             throw "File not found: " + path
         }
-        const root = await xml2js.parseStringPromise(readFileSync(path, { encoding: "utf8" }))
+        const root = await xml2js.parseStringPromise(readFileSync(path, {encoding: "utf8"}))
         return root.svg
     }
 
     public static ReadSvgSync(path: string, callback: (svg: any) => void): any {
         xml2js.parseString(
-            readFileSync(path, { encoding: "utf8" }),
-            { async: false },
+            readFileSync(path, {encoding: "utf8"}),
+            {async: false},
             (err, root) => {
                 if (err) {
                     throw err

@@ -171,7 +173,7 @@ export default class ScriptUtils {
                     })
 
                     res.addListener("end", function () {
-                        resolve({ content: parts.join("") })
+                        resolve({content: parts.join("")})
                     })
                 }
             )

@@ -1,14 +1,15 @@
 import Script from "./Script"
-import { Overpass } from "../Logic/Osm/Overpass"
-import { RegexTag } from "../Logic/Tags/RegexTag"
-import { ImmutableStore } from "../Logic/UIEventSource"
-import { BBox } from "../Logic/BBox"
+import {Overpass} from "../Logic/Osm/Overpass"
+import {RegexTag} from "../Logic/Tags/RegexTag"
+import {ImmutableStore} from "../Logic/UIEventSource"
+import {BBox} from "../Logic/BBox"
 import * as fs from "fs"
-import { Feature } from "geojson"
+import {Feature} from "geojson"
 import ScriptUtils from "./ScriptUtils"
-import { Imgur } from "../Logic/ImageProviders/Imgur"
-import { LicenseInfo } from "../Logic/ImageProviders/LicenseInfo"
-import { Utils } from "../Utils"
+import {Imgur} from "../Logic/ImageProviders/Imgur"
+import {LicenseInfo} from "../Logic/ImageProviders/LicenseInfo"
+import {Utils} from "../Utils"
+import Constants from "../Models/Constants";
 
 export default class GenerateImageAnalysis extends Script {
     constructor() {

@@ -17,9 +18,9 @@ export default class GenerateImageAnalysis extends Script {
         )
     }
 
-    async fetchImages(key: string, datapath: string): Promise<void> {
+    async fetchImages(key: string, datapath: string, refresh: boolean): Promise<void> {
         const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson`
-        if (fs.existsSync(targetPath)) {
+        if (fs.existsSync(targetPath) && !refresh) {
             console.log("Skipping", key)
             return
         }

@@ -27,27 +28,26 @@ export default class GenerateImageAnalysis extends Script {
         const overpass = new Overpass(
             tag,
             [],
-            "https://overpass.kumi.systems/api/interpreter",
+            Constants.defaultOverpassUrls[0], //"https://overpass.kumi.systems/api/interpreter",
             new ImmutableStore(500),
             undefined,
             false
         )
         console.log("Starting query...")
         const data = await overpass.queryGeoJson(BBox.global)
-        console.log("Got data: ", data[0].features.length)
+        console.log("Got data:", data[0].features.length, "items; timestamp:", data[1].toISOString())
         fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8")
         console.log("Written", targetPath)
     }
 
-    async downloadData(datapath: string): Promise<void> {
+    async downloadData(datapath: string, refresh: boolean): Promise<void> {
         if (!fs.existsSync(datapath)) {
             fs.mkdirSync(datapath)
         }
 
-        await this.fetchImages("image", datapath)
-        await this.fetchImages("image:streetsign", datapath)
+        await this.fetchImages("image", datapath, refresh)
+        await this.fetchImages("image:streetsign", datapath, refresh)
         for (let i = 0; i < 5; i++) {
-            await this.fetchImages("image:" + i, datapath)
+            await this.fetchImages("image:" + i, datapath, refresh)
         }
     }
 

@@ -73,25 +73,55 @@ export default class GenerateImageAnalysis extends Script {
         if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) {
             return false
         }
-        const targetPath = datapath + "/" + image.replace(/[\/:.\-%]/g, "_") + ".json"
+        const filename = image.replace(/[\/:.\-%]/g, "_") + ".json"
+        const targetPath = datapath + "/" + filename
         if (fs.existsSync(targetPath)) {
             return false
         }
         const attribution = await Imgur.singleton.DownloadAttribution(image)
 
+        if ((attribution.artist ?? "") === "") {
+            // This is an invalid attribution. We save the raw response as well
+            const hash = image.substr("https://i.imgur.com/".length).split(".jpg")[0]
+
+            const apiUrl = "https://api.imgur.com/3/image/" + hash
+            const response = await Utils.downloadJsonCached(apiUrl, 365 * 24 * 60 * 60, {
+                Authorization: "Client-ID " + Constants.ImgurApiKey,
+            })
+            const rawTarget = datapath + "/raw/" + filename
+            console.log("Also storing the raw response to", rawTarget)
+            await fs.writeFileSync(rawTarget, JSON.stringify(response, null, " "))
+        }
+
         await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, " "))
         return true
     }
 
-    async downloadMetadata(datapath: string): Promise<void> {
-        const features = this.loadData(datapath)
+    loadImageUrls(datapath: string): { allImages: Set<string>, imageSource: Map<string, string> } {
+        let allImages = new Set<string>()
+        const features = this.loadData(datapath)
+        let imageSource: Map<string, string> = new Map<string, string>()
+
         for (const feature of features) {
             allImages.add(feature.properties["image"])
+            imageSource[feature.properties["image"]] = feature.properties.id
             allImages.add(feature.properties["image:streetsign"])
+            imageSource[feature.properties["image:streetsign"]] = feature.properties.id + " (streetsign)"
 
             for (let i = 0; i < 10; i++) {
                 allImages.add(feature.properties["image:" + i])
+                imageSource[feature.properties["image:" + i]] = `${feature.properties.id} (image:${i})`
             }
         }
         allImages.delete(undefined)
         allImages.delete(null)
+        imageSource.delete(undefined)
+        imageSource.delete(null)
+        return {allImages, imageSource}
+    }
+
+    async downloadMetadata(datapath: string): Promise<void> {
+        const {allImages, imageSource} = this.loadImageUrls(datapath)
         console.log("Detected", allImages.size, "images")
         let i = 0
         let d = 0

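A reviewer-style aside, not part of the commit: imageSource is typed as a Map<string, string> but is filled with bracket assignment, which sets plain object properties rather than Map entries. The later imageSource[image] look-up still works, but imageSource.delete(undefined) and imageSource.delete(null) only affect the (empty) Map entries, not those properties. A sketch of the same bookkeeping using the Map API throughout, under the assumption that Map semantics were intended; the helper name is made up:

    // Sketch only: collect image URL -> OSM object id using Map entries instead of properties
    const imageSource = new Map<string, string>()

    function recordSource(image: string | null | undefined, source: string): void {
        if (image === undefined || image === null) {
            return // nothing to record, so nothing needs deleting afterwards
        }
        imageSource.set(image, source)
    }

    // look-ups then become imageSource.get(image) instead of imageSource[image]
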
@@ -113,10 +143,9 @@ export default class GenerateImageAnalysis extends Script {
                 } downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor(
                     runningSecs
                 )}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}`
-                ScriptUtils.erasableLog(
-                    " ",
-                    msg
-                )
+                if (d + f % 1000 === 1 || downloaded) {
+                    ScriptUtils.erasableLog(msg)
+                }
                 if (downloaded) {
                     d++
                 } else {

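A reviewer-style aside, not part of the commit: in the new guard, % binds tighter than + and === binds loosest, so d + f % 1000 === 1 evaluates as (d + (f % 1000)) === 1. If the intent is to log roughly every 1000th handled image, the grouping would need to be explicit, as in this small sketch (the helper is hypothetical; d and f mirror the counters in the diff):

    // Assumed intent: log on roughly every 1000th handled image, or whenever one was downloaded
    const shouldLog = (d: number, f: number, downloaded: boolean): boolean =>
        (d + f) % 1000 === 1 || downloaded
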
@@ -124,10 +153,80 @@ export default class GenerateImageAnalysis extends Script {
                 }
                 if (d + f == 75000) {
                     console.log("Used 75000 API calls, leaving 5000 for the rest of the day...")
                     break
                 }
             } catch (e) {
                 // console.log(e)
+                console.log("Offending image hash is", image, "from https://openstreetmap.org/" + imageSource[image])
                 f++
             }
         }
     }
 
+    async downloadImage(url: string, imagePath: string): Promise<boolean> {
+        const filenameLong = url.replace(/[\/:.\-%]/g, "_") + ".jpg"
+        const targetPathLong = imagePath + "/" + filenameLong
+
+        const filename = url.substring("https://i.imgur.com/".length)
+        const targetPath = imagePath + "/" + filename
+        if (fs.existsSync(targetPathLong)) {
+            if (fs.existsSync(targetPath)) {
+                fs.unlinkSync(targetPathLong)
+                console.log("Unlinking duplicate")
+                return false
+            }
+            console.log("Renaming...")
+            fs.renameSync(targetPathLong, targetPath)
+            return false
+        }
+        if (fs.existsSync(targetPath)) {
+            return false
+        }
+        await ScriptUtils.DownloadFileTo(url, targetPath)
+        return true
+    }
+
+    async downloadAllImages(datapath: string, imagePath: string): Promise<void> {
+        const {allImages} = this.loadImageUrls(datapath)
+        let skipped = 0
+        let failed = 0
+        let downloaded = 0
+        let invalid = 0
+        const startTime = Date.now()
+        const urls = Array.from(allImages).filter(url => url.startsWith("https://i.imgur.com"))
+        for (const url of urls) {
+            const runningTime = ((Date.now()) - startTime) / 1000
+            const handled = skipped + downloaded + failed
+            const itemsLeft = allImages.size - handled
+            const speed = handled / runningTime
+            const timeLeft = Math.round(itemsLeft * speed)
+            try {
+                const downloadedStatus = await Promise.all(url.split(";").map(url =>
+                    this.downloadImage(url.trim(), imagePath),
+                ))
+
+                for (const b of downloadedStatus) {
+                    if (b) {
+                        downloaded += 1
+                    } else {
+                        skipped += 1
+                    }
+                }
+
+                if (downloadedStatus.some(i => i) || skipped % 10000 === 0) {
+                    console.log("Handled", url, JSON.stringify({
+                        skipped,
+                        failed,
+                        downloaded,
+                        invalid,
+                        total: allImages.size,
+                        eta: timeLeft + "s"
+                    }))
+                }
+            } catch (e) {
+                console.log(e)
+                failed++
+            }
+        }
+    }

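Another reviewer-style observation, not part of the commit: speed is computed as items per second, so multiplying it by itemsLeft does not yield seconds; a time estimate would divide the remaining count by the speed instead. A minimal sketch of that calculation, with a hypothetical helper whose names mirror the diff:

    // Hypothetical helper: estimate remaining seconds from progress so far
    function estimateSecondsLeft(handled: number, total: number, runningTimeSec: number): number {
        const speed = handled / runningTimeSec // items per second
        const itemsLeft = total - handled
        return speed > 0 ? Math.round(itemsLeft / speed) : 0
    }
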
@@ -141,7 +240,7 @@ export default class GenerateImageAnalysis extends Script {
             if (!file.endsWith(".json")) {
                 continue
             }
-            const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, { encoding: "utf8" }))
+            const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, {encoding: "utf8"}))
             const license = attr.licenseShortName
 
             if (license === undefined || attr.artist === undefined) {

@@ -220,7 +319,7 @@ export default class GenerateImageAnalysis extends Script {
             ...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()),
         ]
 
-        console.log("Total number of correctly licenses pictures: ", totalLicensedImages)
+        console.log("Total number of correctly licenses pictures: ", totalLicensedImages, "(out of ", files.length, " images)")
         console.log("Total number of authors:", byAuthor.size)
         console.log(
             "Total number of authors which used a valid, non CC0 license at one point in time",

@@ -230,10 +329,15 @@ export default class GenerateImageAnalysis extends Script {
     }
 
     async main(args: string[]): Promise<void> {
+        console.log("Usage: [--cached] to use the cached osm data")
         console.log("Args are", args)
+        const cached = args.indexOf("--cached") < 0
+        args = args.filter(a => a !== "--cached")
         const datapath = args[0] ?? "../../git/MapComplete-data/ImageLicenseInfo"
-        await this.downloadData(datapath)
+        await this.downloadData(datapath, cached)
+
         await this.downloadMetadata(datapath)
+        await this.downloadAllImages(datapath, "/home/pietervdvn/data/imgur-image-backup")
         this.analyze(datapath)
     }
 }

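Taken together, the new main() wiring means: without --cached the script re-queries Overpass (the refresh parameter of downloadData becomes true), while with --cached the existing features_with_*.geojson dumps are reused, matching the note added to the documentation above. A small sketch of that flag handling in isolation; the helper name is made up:

    // Hypothetical helper mirroring the flag handling in main():
    // returns the value that ends up in the "refresh" parameter of downloadData()
    function shouldRefresh(args: string[]): boolean {
        // "--cached" present -> indexOf >= 0 -> false -> reuse features_with_*.geojson
        // "--cached" absent  -> indexOf < 0  -> true  -> re-query Overpass
        return args.indexOf("--cached") < 0
    }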