2021-10-02 17:57:54 +02:00
|
|
|
/**
|
|
|
|
* Some useful utility functions around the Wikipedia API
|
|
|
|
*/
|
|
|
|
import {Utils} from "../../Utils";
|
2021-10-02 22:31:16 +02:00
|
|
|
import {UIEventSource} from "../UIEventSource";
|
2022-05-26 13:23:25 +02:00
|
|
|
import {WikipediaBoxOptions} from "../../UI/Wikipedia/WikipediaBox";
|
2021-10-02 17:57:54 +02:00
|
|
|
|
|
|
|
export default class Wikipedia {
    /**
     * CSS classes of elements (e.g. navigation, infoboxes, maintenance banners, ...) which are
     * stripped from every article fetched by `GetArticleAsync`.
     * Removal is purely class-based: any element carrying one of these classes is dropped.
     */
    private static readonly classesToRemove = [
        "shortdescription",
        "sidebar",
        "infobox",
        "infobox_v2",
        "noprint",
        "ambox",
        "mw-editsection",
        "mw-selflink",
        "mw-empty-elt",
        "hatnote", // Often redirects
    ]

    /**
     * Ids of elements which are stripped from every article fetched by `GetArticleAsync`.
     */
    private static readonly idsToRemove = ["sjabloon_zie"]

    /**
     * Article lookups started via `GetArticle`, shared between all instances.
     * Keyed on backend, page name and the `firstParagraphOnly` flag.
     */
    private static readonly _cache = new Map<string, UIEventSource<{ success: string } | { error: any }>>()

    /**
     * Base URL (including the scheme) of the wiki instance which is queried, e.g. "https://en.wikipedia.org"
     */
    public readonly backend: string

    /**
     * @param options either a wikipedia language code or an explicit backend; the backend wins if both are given.
     *                Defaults to the English Wikipedia.
     */
    constructor(options?: { language?: "en" | string } | { backend?: string }) {
        this.backend = Wikipedia.getBackendUrl(options ?? {})
    }

    /**
     * Tries to extract the language and article name from the given string
     *
     * Wikipedia.extractLanguageAndName("qsdf") // => undefined
     * Wikipedia.extractLanguageAndName("nl:Warandeputten") // => {language: "nl", pageName: "Warandeputten"}
     */
    public static extractLanguageAndName(input: string): { language: string; pageName: string } | undefined {
        // Everything before the first colon is the language, everything after it is the page name
        const matched = input.match(/([^:]+):(.*)/)
        if (matched === null) {
            return undefined
        }
        const [, language, pageName] = matched
        return { language, pageName }
    }

    /**
     * Extracts the actual pagename; returns undefined if this came from a different wikimedia entry
     *
     * new Wikipedia({backend: "https://wiki.openstreetmap.org"}).extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => "NL:Speelbos"
     * new Wikipedia().extractPageName("https://wiki.openstreetmap.org/wiki/NL:Speelbos") // => undefined
     * new Wikipedia().extractPageName("https://en.wikipedia.org.example.com/wiki/Foo") // => undefined
     */
    public extractPageName(input: string): string | undefined {
        if (!input.startsWith(this.backend)) {
            return undefined
        }
        const path = input.substring(this.backend.length)
        // Anchored on purpose: "wiki/" must directly follow the backend, otherwise a URL on
        // another host which merely starts with the same characters would be accepted
        const matched = path.match(/^\/?wiki\/(.+)/)
        return matched?.[1]
    }

    /**
     * Determines the base URL from the constructor options.
     * An explicit backend takes precedence over a language; "https://" is prepended if no http(s)-scheme is given.
     */
    private static getBackendUrl(options: { language?: "en" | string } | { backend?: "en.wikipedia.org" | string }): string {
        // Both accepted shapes fit in this wider shape, which allows reading either key
        const { backend: explicitBackend, language } = options as { backend?: string; language?: string }
        let backend = "en.wikipedia.org"
        if (explicitBackend) {
            backend = explicitBackend
        } else if (language) {
            backend = `${language}.wikipedia.org`
        }
        // Test for a real scheme: a hostname such as "httpbin.org" also starts with "http"
        if (!/^https?:\/\//i.test(backend)) {
            backend = "https://" + backend
        }
        return backend
    }

    /**
     * Gets the (cleaned) article as an event source; repeated requests for the same
     * backend, page and `firstParagraphOnly` setting return the same, cached event source.
     *
     * @param pageName the title of the page on this wiki
     * @param options only `firstParagraphOnly` is read here
     */
    public GetArticle(pageName: string, options: WikipediaBoxOptions): UIEventSource<{ success: string } | { error: any }> {
        const key = this.backend + ":" + pageName + ":" + (options?.firstParagraphOnly ?? false)
        const cached = Wikipedia._cache.get(key)
        if (cached !== undefined) {
            return cached
        }
        const source = UIEventSource.FromPromiseWithErr(this.GetArticleAsync(pageName, options))
        Wikipedia._cache.set(key, source)
        return source
    }

    /**
     * Constructs the URL of the MediaWiki 'parse' API call which yields the rendered HTML of the page.
     * The page name is URL-encoded, so titles containing '&', '+', '#', ... are passed on intact.
     *
     * new Wikipedia().getDataUrl("AT&T") // => "https://en.wikipedia.org/w/api.php?action=parse&format=json&origin=*&prop=text&page=AT%26T"
     */
    public getDataUrl(pageName: string): string {
        return `${this.backend}/w/api.php?action=parse&format=json&origin=*&prop=text&page=` + encodeURIComponent(pageName)
    }

    /**
     * Constructs the human-facing URL of the page on this wiki
     */
    public getPageUrl(pageName: string): string {
        return `${this.backend}/wiki/${pageName}`
    }

    /**
     * Textual search of the specified wiki-instance. If searching Wikipedia, we recommend using wikidata.search instead
     * @param searchTerm
     */
    public async search(searchTerm: string): Promise<{ title: string; snippet: string }[]> {
        const url = this.backend + "/w/api.php?action=query&format=json&list=search&srsearch=" + encodeURIComponent(searchTerm)
        return (await Utils.downloadJson(url))["query"]["search"]
    }

    /**
     * Searches via 'index.php' and scrapes the result.
     * This gives better results than via the API
     * @param searchTerm
     */
    public async searchViaIndex(searchTerm: string): Promise<{ title: string; snippet: string; url: string }[]> {
        const url = `${this.backend}/w/index.php?search=${encodeURIComponent(searchTerm)}`
        const result = await Utils.downloadAdvanced(url)
        const redirect = result["redirect"]
        if (redirect) {
            // This is an exact match
            return [
                {
                    // The redirect might point to a URL from which no page name can be extracted;
                    // the search term is then the best title available
                    title: (this.extractPageName(redirect) ?? searchTerm).trim(),
                    url: redirect,
                    snippet: "",
                },
            ]
        }
        const el = document.createElement("html")
        // Make root-relative links absolute; protocol-relative links ('//host/...') already name their host
        el.innerHTML = result["content"].replace(/href="\/(?!\/)/g, () => 'href="' + this.backend + "/")
        const searchResults = el.getElementsByClassName("mw-search-results")
        const individualResults = Array.from(searchResults[0]?.getElementsByClassName("mw-search-result") ?? [])
        return individualResults.map((entry) => {
            // Alternative titles would otherwise end up in the text of the heading
            for (const altTitle of Array.from(entry.getElementsByClassName("searchalttitle"))) {
                altTitle.remove()
            }
            // A single malformed entry should not abort the entire search: missing parts become empty strings
            return {
                title: entry.getElementsByClassName("mw-search-result-heading")[0]?.textContent?.trim() ?? "",
                url: entry.getElementsByTagName("a")[0]?.href ?? "",
                snippet: entry.getElementsByClassName("searchresult")[0]?.textContent ?? "",
            }
        })
    }

    /**
     * Downloads the rendered page, strips the blacklisted elements and rewrites root-relative links
     * to absolute links which open in a new tab.
     *
     * @param pageName the title of the page on this wiki
     * @param options if `firstParagraphOnly` is set, only the inner HTML of the first remaining paragraph is returned
     * @returns the cleaned HTML, or undefined if the response contains no usable page (or no paragraph, if only the first paragraph was requested)
     */
    public async GetArticleAsync(
        pageName: string,
        options: {
            firstParagraphOnly?: false | boolean
        }
    ): Promise<string | undefined> {
        const response = await Utils.downloadJson(this.getDataUrl(pageName))
        const html = response?.parse?.text?.["*"]
        if (html === undefined || html === null) {
            return undefined
        }

        const div = document.createElement("div")
        div.innerHTML = html
        const content = div.firstElementChild
        if (content === null) {
            // The response did not contain a single element
            return undefined
        }

        for (const forbiddenClass of Wikipedia.classesToRemove) {
            // Copy first: the collection is live and shrinks while elements are removed
            for (const element of Array.from(content.getElementsByClassName(forbiddenClass))) {
                element.remove()
            }
        }

        for (const forbiddenId of Wikipedia.idsToRemove) {
            content.querySelector("#" + forbiddenId)?.remove()
        }

        // Rewrite relative links to absolute links + open them in a new tab
        for (const link of Array.from(content.getElementsByTagName("a"))) {
            // note: link.getAttribute("href") gets the textual value, link.href is the rewritten version which'll contain the host for relative paths
            const href = link.getAttribute("href")
            if (href === null || !href.startsWith("/")) {
                continue
            }
            link.target = "_blank"
            if (!href.startsWith("//")) {
                // Protocol-relative links ('//host/...') already name their host and are kept as is
                link.href = `${this.backend}${href}`
            }
        }

        if (options?.firstParagraphOnly) {
            return content.getElementsByTagName("p").item(0)?.innerHTML
        }

        return content.innerHTML
    }
}
|