Scripts: Update imageAnalysis script to also backup the images
parent e5cc7eec71
commit 088fbe1d07

3 changed files with 156 additions and 48 deletions

@@ -1,3 +1,5 @@
+(To rerun the analysis: use 'scripts/generateImageAnalysis'. Delete 'features_with_*.geojson' first to force updating the OSM-dataset)
+
 # What licenses are used?

 Now that MapComplete is three-and-a-half year old, it's a good time to see what license people are using to upload their images.
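
The note added above describes how to force a refresh. A minimal sketch of the "delete 'features_with_*.geojson' first" step, assuming the default datapath from main() in the diff below and a Node/ts-node environment (both are assumptions, not part of this commit):

    import * as fs from "fs"
    // Default datapath as used by main(); adjust to your own checkout
    const datapath = "../../git/MapComplete-data/ImageLicenseInfo"
    fs.readdirSync(datapath)
        .filter((f) => f.startsWith("features_with_") && f.endsWith(".geojson"))
        .forEach((f) => fs.unlinkSync(datapath + "/" + f))
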
@@ -1,9 +1,9 @@
 import * as fs from "fs"
-import { existsSync, lstatSync, readdirSync, readFileSync } from "fs"
-import { Utils } from "../Utils"
+import {existsSync, lstatSync, readdirSync, readFileSync} from "fs"
+import {Utils} from "../Utils"
 import * as https from "https"
-import { LayoutConfigJson } from "../Models/ThemeConfig/Json/LayoutConfigJson"
-import { LayerConfigJson } from "../Models/ThemeConfig/Json/LayerConfigJson"
+import {LayoutConfigJson} from "../Models/ThemeConfig/Json/LayoutConfigJson"
+import {LayerConfigJson} from "../Models/ThemeConfig/Json/LayerConfigJson"
 import xml2js from "xml2js"

 export default class ScriptUtils {
@@ -37,14 +37,16 @@ export default class ScriptUtils {
 return result
 }

-public static DownloadFileTo(url, targetFilePath: string): void {
-console.log("Downloading ", url, "to", targetFilePath)
-https.get(url, (res) => {
-const filePath = fs.createWriteStream(targetFilePath)
-res.pipe(filePath)
-filePath.on("finish", () => {
-filePath.close()
-console.log("Download Completed")
+public static DownloadFileTo(url, targetFilePath: string): Promise<void> {
+ScriptUtils.erasableLog("Downloading", url, "to", targetFilePath)
+return new Promise<void>((resolve, err) => {
+https.get(url, (res) => {
+const filePath = fs.createWriteStream(targetFilePath)
+res.pipe(filePath)
+filePath.on("finish", () => {
+filePath.close()
+resolve()
+})
 })
 })
 }
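
DownloadFileTo now returns a Promise that resolves only once the write stream has finished, so callers can await the download before touching the file (the new downloadImage() further below does exactly this). A minimal usage sketch; the URL, target path and async wrapper are illustrative assumptions:

    async function backupOne(): Promise<void> {
        // Resolves after filePath.on("finish") has fired, i.e. the file is fully on disk
        await ScriptUtils.DownloadFileTo("https://i.imgur.com/abc1234.jpg", "/tmp/abc1234.jpg")
        console.log("Safe to read /tmp/abc1234.jpg here")
    }
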
@@ -78,13 +80,13 @@ export default class ScriptUtils {
 .filter((path) => path.indexOf("license_info.json") < 0)
 .map((path) => {
 try {
-const contents = readFileSync(path, { encoding: "utf8" })
+const contents = readFileSync(path, {encoding: "utf8"})
 if (contents === "") {
 throw "The file " + path + " is empty, did you properly save?"
 }

 const parsed = JSON.parse(contents)
-return { parsed, path }
+return {parsed, path}
 } catch (e) {
 console.error("Could not parse file ", "./assets/layers/" + path, "due to ", e)
 throw e
@@ -101,12 +103,12 @@ export default class ScriptUtils {
 public static getThemeFiles(): { parsed: LayoutConfigJson; path: string }[] {
 return this.getThemePaths().map((path) => {
 try {
-const contents = readFileSync(path, { encoding: "utf8" })
+const contents = readFileSync(path, {encoding: "utf8"})
 if (contents === "") {
 throw "The file " + path + " is empty, did you properly save?"
 }
 const parsed = JSON.parse(contents)
-return { parsed: parsed, path: path }
+return {parsed: parsed, path: path}
 } catch (e) {
 console.error("Could not read file ", path, "due to ", e)
 throw e
@@ -125,14 +127,14 @@ export default class ScriptUtils {
 if (!existsSync(path)) {
 throw "File not found: " + path
 }
-const root = await xml2js.parseStringPromise(readFileSync(path, { encoding: "utf8" }))
+const root = await xml2js.parseStringPromise(readFileSync(path, {encoding: "utf8"}))
 return root.svg
 }

 public static ReadSvgSync(path: string, callback: (svg: any) => void): any {
 xml2js.parseString(
-readFileSync(path, { encoding: "utf8" }),
-{ async: false },
+readFileSync(path, {encoding: "utf8"}),
+{async: false},
 (err, root) => {
 if (err) {
 throw err
@@ -171,7 +173,7 @@ export default class ScriptUtils {
 })

 res.addListener("end", function () {
-resolve({ content: parts.join("") })
+resolve({content: parts.join("")})
 })
 }
 )

@@ -1,14 +1,15 @@
 import Script from "./Script"
-import { Overpass } from "../Logic/Osm/Overpass"
-import { RegexTag } from "../Logic/Tags/RegexTag"
-import { ImmutableStore } from "../Logic/UIEventSource"
-import { BBox } from "../Logic/BBox"
+import {Overpass} from "../Logic/Osm/Overpass"
+import {RegexTag} from "../Logic/Tags/RegexTag"
+import {ImmutableStore} from "../Logic/UIEventSource"
+import {BBox} from "../Logic/BBox"
 import * as fs from "fs"
-import { Feature } from "geojson"
+import {Feature} from "geojson"
 import ScriptUtils from "./ScriptUtils"
-import { Imgur } from "../Logic/ImageProviders/Imgur"
-import { LicenseInfo } from "../Logic/ImageProviders/LicenseInfo"
-import { Utils } from "../Utils"
+import {Imgur} from "../Logic/ImageProviders/Imgur"
+import {LicenseInfo} from "../Logic/ImageProviders/LicenseInfo"
+import {Utils} from "../Utils"
+import Constants from "../Models/Constants";

 export default class GenerateImageAnalysis extends Script {
 constructor() {
@@ -17,9 +18,9 @@ export default class GenerateImageAnalysis extends Script {
 )
 }

-async fetchImages(key: string, datapath: string): Promise<void> {
+async fetchImages(key: string, datapath: string, refresh: boolean): Promise<void> {
 const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson`
-if (fs.existsSync(targetPath)) {
+if (fs.existsSync(targetPath) && !refresh) {
 console.log("Skipping", key)
 return
 }
@@ -27,27 +28,26 @@ export default class GenerateImageAnalysis extends Script {
 const overpass = new Overpass(
 tag,
 [],
-"https://overpass.kumi.systems/api/interpreter",
+Constants.defaultOverpassUrls[0], //"https://overpass.kumi.systems/api/interpreter",
 new ImmutableStore(500),
 undefined,
 false
 )
 console.log("Starting query...")
 const data = await overpass.queryGeoJson(BBox.global)
-console.log("Got data: ", data[0].features.length)
+console.log("Got data:", data[0].features.length, "items; timestamp:", data[1].toISOString())
 fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8")
 console.log("Written", targetPath)
 }

-async downloadData(datapath: string): Promise<void> {
+async downloadData(datapath: string, refresh: boolean): Promise<void> {
 if (!fs.existsSync(datapath)) {
 fs.mkdirSync(datapath)
 }
-
-await this.fetchImages("image", datapath)
-await this.fetchImages("image:streetsign", datapath)
+await this.fetchImages("image", datapath, refresh)
+await this.fetchImages("image:streetsign", datapath, refresh)

 for (let i = 0; i < 5; i++) {
-await this.fetchImages("image:" + i, datapath)
+await this.fetchImages("image:" + i, datapath, refresh)
 }
 }
@@ -73,25 +73,55 @@ export default class GenerateImageAnalysis extends Script {
 if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) {
 return false
 }
-const targetPath = datapath + "/" + image.replace(/[\/:.\-%]/g, "_") + ".json"
+const filename = image.replace(/[\/:.\-%]/g, "_") + ".json"
+const targetPath = datapath + "/" + filename
 if (fs.existsSync(targetPath)) {
 return false
 }
 const attribution = await Imgur.singleton.DownloadAttribution(image)
+
+if ((attribution.artist ?? "") === "") {
+// This is an invalid attribution. We save the raw response as well
+const hash = image.substr("https://i.imgur.com/".length).split(".jpg")[0]
+
+const apiUrl = "https://api.imgur.com/3/image/" + hash
+const response = await Utils.downloadJsonCached(apiUrl, 365 * 24 * 60 * 60, {
+Authorization: "Client-ID " + Constants.ImgurApiKey,
+})
+const rawTarget = datapath + "/raw/" + filename
+console.log("Also storing the raw response to", rawTarget)
+await fs.writeFileSync(rawTarget, JSON.stringify(response, null, " "))
+}
+
 await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, " "))
 return true
 }

-async downloadMetadata(datapath: string): Promise<void> {
-const features = this.loadData(datapath)
+loadImageUrls(datapath: string): { allImages: Set<string>, imageSource: Map<string, string> } {
 let allImages = new Set<string>()
+const features = this.loadData(datapath)
+let imageSource: Map<string, string> = new Map<string, string>()

 for (const feature of features) {
 allImages.add(feature.properties["image"])
+imageSource[feature.properties["image"]] = feature.properties.id
+allImages.add(feature.properties["image:streetsign"])
+imageSource[feature.properties["image:streetsign"]] = feature.properties.id + " (streetsign)"
+
 for (let i = 0; i < 10; i++) {
 allImages.add(feature.properties["image:" + i])
+imageSource[feature.properties["image:" + i]] = `${feature.properties.id} (image:${i})`
 }
 }
+allImages.delete(undefined)
+allImages.delete(null)
+imageSource.delete(undefined)
+imageSource.delete(null)
+return {allImages, imageSource}
+}
+
+async downloadMetadata(datapath: string): Promise<void> {
+const {allImages, imageSource} = this.loadImageUrls(datapath)
 console.log("Detected", allImages.size, "images")
 let i = 0
 let d = 0
@@ -113,10 +143,9 @@ export default class GenerateImageAnalysis extends Script {
 } downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor(
 runningSecs
 )}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}`
-ScriptUtils.erasableLog(
-" ",
-msg
-)
+if (d + f % 1000 === 1 || downloaded) {
+ScriptUtils.erasableLog(msg)
+}
 if (downloaded) {
 d++
 } else {
@@ -124,10 +153,80 @@ export default class GenerateImageAnalysis extends Script {
 }
 if (d + f == 75000) {
 console.log("Used 75000 API calls, leaving 5000 for the rest of the day...")
+break
+}
+} catch (e) {
+// console.log(e)
+console.log("Offending image hash is", image, "from https://openstreetmap.org/" + imageSource[image])
+f++
+}
+}
+}
+
+async downloadImage(url: string, imagePath: string): Promise<boolean> {
+const filenameLong = url.replace(/[\/:.\-%]/g, "_") + ".jpg"
+const targetPathLong = imagePath + "/" + filenameLong
+
+const filename = url.substring("https://i.imgur.com/".length)
+const targetPath = imagePath + "/" + filename
+if (fs.existsSync(targetPathLong)) {
+if (fs.existsSync(targetPath)) {
+fs.unlinkSync(targetPathLong)
+console.log("Unlinking duplicate")
+return false
+}
+console.log("Renaming...")
+fs.renameSync(targetPathLong, targetPath)
+return false
+}
+if (fs.existsSync(targetPath)) {
+return false
+}
+await ScriptUtils.DownloadFileTo(url, targetPath)
+return true
+}
+
+async downloadAllImages(datapath: string, imagePath: string): Promise<void> {
+const {allImages} = this.loadImageUrls(datapath)
+let skipped = 0
+let failed = 0
+let downloaded = 0
+let invalid = 0
+const startTime = Date.now()
+const urls = Array.from(allImages).filter(url => url.startsWith("https://i.imgur.com"))
+for (const url of urls) {
+const runningTime = ((Date.now()) - startTime) / 1000
+const handled = skipped + downloaded + failed
+const itemsLeft = allImages.size - handled
+const speed = handled / runningTime
+const timeLeft = Math.round(itemsLeft * speed)
+try {
+const downloadedStatus = await Promise.all(url.split(";").map(url =>
+this.downloadImage(url.trim(), imagePath),
+))
+
+for (const b of downloadedStatus) {
+if (b) {
+downloaded += 1
+} else {
+skipped += 1
+}
+}
+
+if (downloadedStatus.some(i => i) || skipped % 10000 === 0) {
+
+console.log("Handled", url, JSON.stringify({
+skipped,
+failed,
+downloaded,
+invalid,
+total: allImages.size,
+eta: timeLeft + "s"
+}))
 }
 } catch (e) {
 console.log(e)
-f++
+failed++
 }
 }
 }
@@ -141,7 +240,7 @@ export default class GenerateImageAnalysis extends Script {
 if (!file.endsWith(".json")) {
 continue
 }
-const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, { encoding: "utf8" }))
+const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, {encoding: "utf8"}))
 const license = attr.licenseShortName

 if (license === undefined || attr.artist === undefined) {
@@ -220,7 +319,7 @@ export default class GenerateImageAnalysis extends Script {
 ...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()),
 ]

-console.log("Total number of correctly licenses pictures: ", totalLicensedImages)
+console.log("Total number of correctly licenses pictures: ", totalLicensedImages, "(out of ", files.length, " images)")
 console.log("Total number of authors:", byAuthor.size)
 console.log(
 "Total number of authors which used a valid, non CC0 license at one point in time",
@@ -230,10 +329,15 @@ export default class GenerateImageAnalysis extends Script {
 }

 async main(args: string[]): Promise<void> {
+console.log("Usage: [--cached] to use the cached osm data")
+console.log("Args are", args)
+const cached = args.indexOf("--cached") < 0
+args = args.filter(a => a !== "--cached")
 const datapath = args[0] ?? "../../git/MapComplete-data/ImageLicenseInfo"
-await this.downloadData(datapath)
+await this.downloadData(datapath, cached)

 await this.downloadMetadata(datapath)
+await this.downloadAllImages(datapath, "/home/pietervdvn/data/imgur-image-backup")
 this.analyze(datapath)
 }
 }
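
A short, self-contained trace of the new --cached handling in main(), with hypothetical argument values (the helper below is added for illustration and does not exist in the script):

    // Mirrors `const cached = args.indexOf("--cached") < 0` from the hunk above.
    function refreshRequested(args: string[]): boolean {
        // In the committed code this value is named `cached`: it is true when
        // --cached is absent and is then passed on as the `refresh` parameter.
        return args.indexOf("--cached") < 0
    }

    console.log(refreshRequested(["--cached"])) // false: keep existing features_with_*.geojson
    console.log(refreshRequested([]))           // true: query Overpass again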