import { Utils } from "../../Utils" import Wikidata, { WikidataResponse } from "./Wikidata" import { Store, UIEventSource } from "../UIEventSource" export interface FullWikipediaDetails { articleUrl?: string language?: string pagename?: string fullArticle?: string firstParagraph?: string restOfArticle?: string wikidata?: WikidataResponse title?: string } export default class Wikipedia { /** * When getting a wikipedia page data result, some elements (e.g. navigation, infoboxes, ...) should be removed if 'removeInfoBoxes' is set. * We do this based on the classes. This set contains a blacklist of the classes to remove * @private */ private static readonly classesToRemove = [ "shortdescription", "sidebar", "infobox", "infobox_v2", "noprint", "ambox", "mw-editsection", "mw-selflink", "mw-empty-elt", "hatnote", // Often redirects ] private static readonly idsToRemove = ["sjabloon_zie"] private static readonly _cache = new Map>() private static _fullDetailsCache = new Map>() public readonly backend: string constructor(options?: { language?: "en" | string } | { backend?: string }) { this.backend = Wikipedia.getBackendUrl(options ?? {}) } /** * Tries to extract the language and article name from the given string * * Wikipedia.extractLanguageAndName("qsdf") // => undefined * Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"} */ public static extractLanguageAndName(input: string): { language: string; pageName: string } { const matched = input.match("([^:]+):(.*)") if (matched === undefined || matched === null) { return undefined } const [_, language, pageName] = matched return { language, pageName, } } /** * Fetch all useful information for the given entity. * */ public static fetchArticleAndWikidata( wikidataOrPageId: string, preferedLanguage: string ): Store { const cachekey = preferedLanguage + wikidataOrPageId const cached = Wikipedia._fullDetailsCache.get(cachekey) if (cached) { return cached } console.log("Constructing store for", cachekey) const store = new UIEventSource({}, cachekey) Wikipedia._fullDetailsCache.set(cachekey, store) // Are we dealing with a wikidata item? const wikidataId = Wikidata.ExtractKey(wikidataOrPageId) if (!wikidataId) { // We are dealing with a wikipedia identifier, e.g. 'NL:articlename', 'https://nl.wikipedia.org/wiki/article', ... const { language, pageName } = Wikipedia.extractLanguageAndName(wikidataOrPageId) store.data.articleUrl = new Wikipedia({ language }).getPageUrl(pageName) store.data.language = language store.data.pagename = pageName store.data.title = pageName } else { // Jup, this is a wikidata item // Lets fetch the wikidata store.data.title = wikidataId Wikidata.LoadWikidataEntryAsync(wikidataId).then((wikidata) => { store.data.wikidata = wikidata store.ping() // With the wikidata, we can search for the appropriate wikipedia page const preferredLanguage = [ preferedLanguage, "en", Array.from(wikidata.wikisites.keys())[0], ] for (const language of preferredLanguage) { const pagetitle = wikidata.wikisites.get(language) if (pagetitle) { store.data.articleUrl = new Wikipedia({ language }).getPageUrl(pagetitle) store.data.pagename = pagetitle store.data.language = language store.data.title = pagetitle store.ping() break } } }) } // Now that the pageURL has been setup, we can focus on downloading the actual article // We setup a listener. As soon as the article-URL is know, we'll fetch the actual page // This url can either be set by the Wikidata-response or directly if we are dealing with a wikipedia-url store.addCallbackAndRun((data) => { if (data.language === undefined || data.pagename === undefined) { return } const wikipedia = new Wikipedia({ language: data.language }) wikipedia.GetArticleHtml(data.pagename).then((article) => { data.fullArticle = article const content = document.createElement("div") content.innerHTML = article const firstParagraph = content.getElementsByTagName("p").item(0) data.firstParagraph = firstParagraph.innerHTML content.removeChild(firstParagraph) data.restOfArticle = content.innerHTML store.ping() }) return true // unregister }) return store } private static getBackendUrl( options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string } ): string { let backend = "en.wikipedia.org" if (options["backend"]) { backend = options["backend"] } else if (options["language"]) { backend = `${options["language"] ?? "en"}.wikipedia.org` } if (!backend.startsWith("http")) { backend = "https://" + backend } return backend } /** * Extracts the actual pagename; returns undefined if this came from a different wikimedia entry * * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos" * new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined */ public extractPageName(input: string): string | undefined { if (!input.startsWith(this.backend)) { return undefined } input = input.substring(this.backend.length) const matched = input.match("/?wiki/(.+)") if (matched === undefined || matched === null) { return undefined } const [_, pageName] = matched return pageName } public getDataUrl(pageName: string): string { return ( `${this.backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + pageName ) } public getPageUrl(pageName: string): string { return `${this.backend}/wiki/${pageName}` } /** * Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead * @param searchTerm */ public async search(searchTerm: string): Promise<{ title: string; snippet: string }[]> { const url = this.backend + "/w/api.php?action=query&format=json&list=search&srsearch=" + encodeURIComponent(searchTerm) return (await Utils.downloadJson(url))["query"]["search"] } /** * Searches via 'index.php' and scrapes the result. * This gives better results then via the API * @param searchTerm */ public async searchViaIndex( searchTerm: string ): Promise<{ title: string; snippet: string; url: string }[]> { const url = `${this.backend}/w/index.php?search=${encodeURIComponent(searchTerm)}&ns0=1` const result = await Utils.downloadAdvanced(url) if (result["redirect"]) { const targetUrl = result["redirect"] // This is an exact match return [ { title: this.extractPageName(targetUrl)?.trim(), url: targetUrl, snippet: "", }, ] } if (result["error"]) { throw "Could not download: " + JSON.stringify(result) } const el = document.createElement("html") el.innerHTML = result["content"].replace(/href="\//g, 'href="' + this.backend + "/") const searchResults = el.getElementsByClassName("mw-search-results") const individualResults = Array.from( searchResults[0]?.getElementsByClassName("mw-search-result") ?? [] ) return individualResults.map((result) => { const toRemove = Array.from(result.getElementsByClassName("searchalttitle")) for (const toRm of toRemove) { toRm.parentElement.removeChild(toRm) } return { title: result .getElementsByClassName("mw-search-result-heading")[0] .textContent.trim(), url: result.getElementsByTagName("a")[0].href, snippet: result.getElementsByClassName("searchresult")[0].textContent, } }) } /** * Returns the innerHTML for the given article as string. * Some cleanup is applied to this. * * This method uses a static, local cache, so each article will be retrieved only once via the network */ public GetArticleHtml(pageName: string): Promise { const cacheKey = this.backend + "/" + pageName if (Wikipedia._cache.has(cacheKey)) { return Wikipedia._cache.get(cacheKey) } const promise = this.GetArticleUncachedAsync(pageName) Wikipedia._cache.set(cacheKey, promise) return promise } private async GetArticleUncachedAsync(pageName: string): Promise { const response = await Utils.downloadJson(this.getDataUrl(pageName)) if (response?.parse?.text === undefined) { return undefined } const html = response["parse"]["text"]["*"] if (html === undefined) { return undefined } const div = document.createElement("div") div.innerHTML = html const content = Array.from(div.children)[0] for (const forbiddenClass of Wikipedia.classesToRemove) { const toRemove = content.getElementsByClassName(forbiddenClass) for (const toRemoveElement of Array.from(toRemove)) { toRemoveElement.parentElement?.removeChild(toRemoveElement) } } for (const forbiddenId of Wikipedia.idsToRemove) { const toRemove = content.querySelector("#" + forbiddenId) toRemove?.parentElement?.removeChild(toRemove) } const links = Array.from(content.getElementsByTagName("a")) // Rewrite relative links to absolute links + open them in a new tab links .filter((link) => link.getAttribute("href")?.startsWith("/") ?? false) .forEach((link) => { link.target = "_blank" // note: link.getAttribute("href") gets the textual value, link.href is the rewritten version which'll contain the host for relative paths link.href = `${this.backend}${link.getAttribute("href")}` }) return content.innerHTML } }