import Script from "../scripts/Script"
import { Server } from "./server"
import parse from "node-html-parser"
import ScriptUtils from "./ScriptUtils"

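/**
 * A small server which, on request, fetches an arbitrary webpage and returns the
 * LD+JSON linked data embedded in it. Results are kept in an in-memory cache.
 */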
class ServerLdScrape extends Script {
    constructor() {
        super("Starts a server which fetches a webpage and returns embedded LD+JSON")
    }
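
    /**
     * Attempts to download the given URL, trying each header set below in order.
     * Returns the download result or, if every attempt fails, undefined.
     */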
    private static async attemptDownload(url: string) {
        // 'host' is only referenced by the commented-out header sets below
        const host = new URL(url).host
        // Random values to vary the spoofed Chrome version between requests
        const random = Math.floor(Math.random() * 100)
        const random1 = Math.floor(Math.random() * 100)
        const headers = [
            {
                "User-Agent": `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.${random}.${random1} Safari/537.36`,
                "accept": "application/html"
            }
            /* Other header sets which can be swapped in when the Chrome user agent gets blocked:
            {
                "User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
                "accept": "application/html"
            },
            {
                Host: host,
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0",
                // The backslash keeps the wildcard from closing this block comment; remove it when uncommenting
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*\/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "Alt-Used": host,
                DNT: 1,
                "Sec-GPC": 1,
                "Upgrade-Insecure-Requests": 1,
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "cross-site",
                "Sec-Fetch-User": "?1",
                "TE": "trailers",
                Connection: "keep-alive"
            }*/
        ]

        for (let i = 0; i < headers.length; i++) {
            try {
                return await ScriptUtils.Download(url, headers[i], 10)
            } catch (e) {
                console.error("Could not download", url, "with headers", headers[i], "due to", e)
            }
        }
        // All header sets failed; implicitly returns undefined
    }
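
    /** Starts the server; the port may be given as the first CLI argument (default: 2346). */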
    async main(args: string[]): Promise<void> {
        const port = Number(args[0] ?? 2346)
        const cache: Record<string, { date: Date; contents: any }> = {}
        new Server(port, {}, [
            {
                mustMatch: "extractgraph",
                mimetype: "application/ld+json",
                addHeaders: {
                    "Cache-control": "max-age=3600, public"
                },
                async handle(content, searchParams: URLSearchParams) {
                    const url = searchParams.get("url")
                    console.log("URL", url)
                    if (cache[url] !== undefined) {
                        const { date, contents } = cache[url]
                        // Age of the cache entry, in seconds
                        const tdiff = (new Date().getTime() - (date?.getTime() ?? 0)) / 1000
                        // Reuse the entry if it is less than a month old
                        if (tdiff < 31 * 24 * 60 * 60) {
                            return JSON.stringify(contents)
                        }
                    }
                    // Start with a fake "redirect" to the requested URL itself
                    let dloaded: { content: string } | { redirect: string } | "timeout" = {
                        redirect: url
                    }

                    do {
                        dloaded = await ServerLdScrape.attemptDownload(dloaded["redirect"])
                        if (dloaded === "timeout") {
                            return "{\"#\":\"timeout reached\"}"
                        }
                        if (dloaded === undefined) {
                            // Every download attempt failed
                            return undefined
                        }
                    } while (dloaded["redirect"])

                    if (dloaded["content"].startsWith("{")) {
                        // This is probably JSON already, not HTML
                        const snippet = JSON.parse(dloaded["content"])
                        console.log("Snippet is", snippet)
                        cache[url] = { contents: snippet, date: new Date() }
                        return JSON.stringify(snippet)
                    }

                    // Scan the HTML for <script type="application/ld+json"> tags
                    const parsed = parse(dloaded["content"])
                    const scripts = Array.from(parsed.getElementsByTagName("script"))
                    for (const script of scripts) {
                        const tp = script.attributes["type"]
                        if (tp !== "application/ld+json") {
                            continue
                        }
                        try {
                            const snippet = JSON.parse(script.textContent)
                            // Remember which page this snippet came from
                            snippet["@base"] = url
                            cache[url] = { contents: snippet, date: new Date() }
                            return JSON.stringify(snippet)
                        } catch (e) {
                            console.error(e)
                        }
                    }
                }
            }
        ])
    }
}

new ServerLdScrape().run()
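
/* Example request, assuming the default port and that Server routes paths
   containing "extractgraph" to the handler above (the exact URL shape depends
   on the Server implementation):

     curl "http://localhost:2346/extractgraph?url=https://example.org/some-page"

   On success, the response is the first LD+JSON snippet found on the page,
   with "@base" set to the requested URL. */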