2024-02-26 02:24:46 +01:00
|
|
|
import fs from "fs"
|
2024-02-29 14:54:14 +01:00
|
|
|
import readline from "readline"
|
2024-02-26 02:24:46 +01:00
|
|
|
import Script from "../Script"
|
|
|
|
import LinkedDataLoader from "../../src/Logic/Web/LinkedDataLoader"
|
|
|
|
import UrlValidator from "../../src/UI/InputElement/Validators/UrlValidator"
|
|
|
|
// vite-node scripts/importscripts/compareWebsiteData.ts -- ~/Downloads/ShopsWithWebsiteNodes.csv ~/data/scraped_websites/
|
|
|
|
/**
 * Reads a CSV file line by line, extracts the id and the tags object of every row, fetches the
 * JSON-LD of the website mentioned in the tags (using an on-disk cache) and appends the outcome of
 * `LinkedDataLoader.removeDuplicateData` to `diff.csv` in the current working directory.
 *
 * Expected arguments: `csv-file datadir`, where `datadir` is the cache directory.
 */
class CompareWebsiteData extends Script {
    /** Maximum number of unparseable lines that are reported individually (the rest is only counted) */
    private static readonly maxReportedFailures = 5

    private readonly urlFormatter = new UrlValidator()

    constructor() {
        super(
            "Given a csv file with 'id', 'tags' and 'website', attempts to fetch jsonld and compares the attributes. Usage: csv-file datadir"
        )
    }

    /**
     * Splits one CSV row into the id (first column) and the parsed tags object.
     *
     * The tags column is a quoted JSON object in which every `"` is doubled (CSV escaping).
     * Everything between the first `{` and the last `}` of the line is taken as that object, so the
     * surrounding quotes and anything before or after the object are ignored.
     *
     * @throws Error if the line contains no `{...}` section
     * @throws SyntaxError if the id or the tags are not valid JSON
     */
    private static parseLine(line: string): { id: string | number; tags: Record<string, string> } {
        const id = JSON.parse(line.split(",")[0])
        const start = line.indexOf("{")
        const end = line.lastIndexOf("}")
        if (start < 0 || end < start) {
            throw new Error("No tags object found in line: " + line.substring(0, 80))
        }
        // Undo the CSV quote-doubling before handing the text to the JSON parser
        const tags = JSON.parse(line.substring(start, end + 1).replace(/""/g, '"'))
        return { id, tags }
    }

    /**
     * Returns the JSON-LD for `url`, reading it from `cachedir` when a cached copy exists and
     * fetching + caching it otherwise.
     *
     * The cache file is named after the URI-encoded url; the result of a fetch is always written,
     * also if it is an empty object.
     *
     * @param cachedir directory holding the cached responses; must exist
     * @param url the website to fetch linked data for
     * @returns whatever `LinkedDataLoader.fetchJsonLd` yielded for this url (possibly from cache)
     */
    async getWithCache(cachedir: string, url: string): Promise<any> {
        const filename = cachedir + "/" + encodeURIComponent(url)
        if (fs.existsSync(filename)) {
            return JSON.parse(fs.readFileSync(filename, "utf-8"))
        }
        // NOTE(review): the meaning of the "proxy" mode is defined by LinkedDataLoader, not here
        const jsonLd = await LinkedDataLoader.fetchJsonLd(url, undefined, "proxy")
        console.log("Got:", jsonLd)
        fs.writeFileSync(filename, JSON.stringify(jsonLd))
        return jsonLd
    }

    /**
     * Handles a single CSV row: fetches the linked data of the row's website and appends
     * `id, <diff-json>` to `targetfile`.
     *
     * @param line one data row of the CSV file (not the header)
     * @param cachedir directory used by `getWithCache`
     * @param targetfile file to which the diff is appended
     * @returns `true` if a diff was written; `false` if no linked data was found or if fetching,
     * diffing or writing failed
     * @throws if the row itself cannot be parsed (see `parseLine`)
     */
    async handleEntry(line: string, cachedir: string, targetfile: string): Promise<boolean> {
        const { id, tags } = CompareWebsiteData.parseLine(line)
        try {
            const website = this.urlFormatter.reformat(tags.website)
            console.log(website)
            const jsonld = await this.getWithCache(cachedir, website)
            // Nothing to compare if the website did not yield any linked data
            if (jsonld == null || Object.keys(jsonld).length === 0) {
                return false
            }
            const diff = LinkedDataLoader.removeDuplicateData(jsonld, tags)
            fs.appendFileSync(targetfile, id + ", " + JSON.stringify(diff) + "\n\n")
            return true
        } catch (e) {
            // Include the reason: this also catches failures of the cache, the diff and the append
            console.error("Could not download ", tags.website, e instanceof Error ? e.message : e)
            return false
        }
    }

    /**
     * Entry point of the script.
     *
     * @param args `[csv-file, datadir]`
     * @throws Error if fewer than two arguments are given
     */
    async main(args: string[]): Promise<void> {
        if (args.length < 2) {
            throw new Error("Not enough arguments")
        }
        const csvPath = args[0]
        const cachedir = args[1]
        // Without the cache directory, every write in `getWithCache` would fail
        fs.mkdirSync(cachedir, { recursive: true })

        const readInterface = readline.createInterface({
            input: fs.createReadStream(csvPath),
            // Always treat \r\n as a single line break
            crlfDelay: Infinity,
        })

        let handled = 0
        let diffed = 0
        let failed = 0
        const targetfile = "diff.csv"
        fs.writeFileSync(targetfile, "id, diff-json\n")

        for await (const line of readInterface) {
            // Skip the header row and blank lines
            if (line.startsWith('"id"') || line.trim() === "") {
                continue
            }
            try {
                const madeComparison = await this.handleEntry(line, cachedir, targetfile)
                handled++
                if (madeComparison) {
                    diffed++
                }
                if (handled % 1000 === 0) {
                    console.log("Handled ", handled, " got ", diffed, "diff results")
                }
            } catch (e) {
                // Best effort: a malformed row must not abort the run, but it should not vanish silently either
                failed++
                if (failed <= CompareWebsiteData.maxReportedFailures) {
                    console.error("Skipping line:", e instanceof Error ? e.message : e)
                }
            }
        }
        console.log(
            "Handled ",
            handled,
            " got ",
            diffed,
            "diff results; skipped",
            failed,
            "unparseable lines"
        )
    }
}
|
|
|
|
|
|
|
|
// Script entry point: build the script object and start it through `run()`. That method is not
// defined on CompareWebsiteData itself, so it comes from the `Script` base class
// (presumably it forwards the command line arguments to `main` — confirm in Script).
const compareWebsiteData = new CompareWebsiteData()
compareWebsiteData.run()
|