import { Utils } from "../../Utils" import { UIEventSource } from "../UIEventSource" import * as wds from "wikidata-sdk" export class WikidataResponse { public readonly id: string public readonly labels: Map public readonly descriptions: Map public readonly claims: Map> public readonly wikisites: Map public readonly commons: string constructor( id: string, labels: Map, descriptions: Map, claims: Map>, wikisites: Map, commons: string ) { this.id = id this.labels = labels this.descriptions = descriptions this.claims = claims this.wikisites = wikisites this.commons = commons } public static fromJson(entity: any): WikidataResponse { const labels = new Map() for (const labelName in entity.labels) { // The labelname is the language code labels.set(labelName, entity.labels[labelName].value) } const descr = new Map() for (const labelName in entity.descriptions) { // The labelname is the language code descr.set(labelName, entity.descriptions[labelName].value) } const sitelinks = new Map() for (const labelName in entity.sitelinks) { // labelName is `${language}wiki` const language = labelName.substring(0, labelName.length - 4) const title = entity.sitelinks[labelName].title sitelinks.set(language, title) } const commons = sitelinks.get("commons") sitelinks.delete("commons") const claims = WikidataResponse.extractClaims(entity.claims) return new WikidataResponse(entity.id, labels, descr, claims, sitelinks, commons) } static extractClaims(claimsJson: any): Map> { const simplified = wds.simplify.claims(claimsJson, { timeConverter: "simple-day", }) const claims = new Map>() for (const claimId in simplified) { const claimsList: any[] = simplified[claimId] claims.set(claimId, new Set(claimsList)) } return claims } } export class WikidataLexeme { id: string lemma: Map senses: Map claims: Map> constructor(json) { this.id = json.id this.claims = WikidataResponse.extractClaims(json.claims) this.lemma = new Map() for (const language in json.lemmas) { this.lemma.set(language, json.lemmas[language].value) } this.senses = new Map() for (const sense of json.senses) { const glosses = sense.glosses for (const language in glosses) { let previousSenses = this.senses.get(language) if (previousSenses === undefined) { previousSenses = "" } else { previousSenses = previousSenses + "; " } this.senses.set(language, previousSenses + glosses[language].value ?? "") } } } asWikidataResponse() { return new WikidataResponse( this.id, this.lemma, this.senses, this.claims, new Map(), undefined ) } } export interface WikidataSearchoptions { lang?: "en" | string maxCount?: 20 | number } export interface WikidataAdvancedSearchoptions extends WikidataSearchoptions { instanceOf?: number[] notInstanceOf?: number[] } /** * Utility functions around wikidata */ export default class Wikidata { private static readonly _identifierPrefixes = ["Q", "L"].map((str) => str.toLowerCase()) private static readonly _prefixesToRemove = [ "https://www.wikidata.org/wiki/Lexeme:", "https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/", "Lexeme:", ].map((str) => str.toLowerCase()) private static readonly _cache = new Map< string, UIEventSource<{ success: WikidataResponse } | { error: any }> >() /** * Same as LoadWikidataEntry, but wrapped into a UIEventSource * @param value * @constructor */ public static LoadWikidataEntry( value: string | number ): UIEventSource<{ success: WikidataResponse } | { error: any }> { const key = this.ExtractKey(value) const cached = Wikidata._cache.get(key) if (cached !== undefined) { return cached } const src = UIEventSource.FromPromiseWithErr(Wikidata.LoadWikidataEntryAsync(key)) Wikidata._cache.set(key, src) return src } /** * Given a search text, searches for the relevant wikidata entries, excluding pages "outside of the main tree", e.g. disambiguation pages. * Optionally, an 'instance of' can be given to limit the scope, e.g. instanceOf:5 (humans) will only search for humans */ public static async searchAdvanced( text: string, options: WikidataAdvancedSearchoptions ): Promise< { id: string relevance?: number label: string description?: string }[] > { let instanceOf = "" if (options?.instanceOf !== undefined && options.instanceOf.length > 0) { const phrases = options.instanceOf.map((q) => `{ ?item wdt:P31/wdt:P279* wd:Q${q}. }`) instanceOf = "{" + phrases.join(" UNION ") + "}" } const forbidden = (options?.notInstanceOf ?? []).concat([17379835]) // blacklist 'wikimedia pages outside of the main knowledge tree', e.g. disambiguation pages const minusPhrases = forbidden.map((q) => `MINUS {?item wdt:P31/wdt:P279* wd:Q${q} .}`) const sparql = `SELECT * WHERE { SERVICE wikibase:mwapi { bd:serviceParam wikibase:api "EntitySearch" . bd:serviceParam wikibase:endpoint "www.wikidata.org" . bd:serviceParam mwapi:search "${text}" . bd:serviceParam mwapi:language "${options.lang}" . ?item wikibase:apiOutputItem mwapi:item . ?num wikibase:apiOrdinal true . bd:serviceParam wikibase:limit ${ Math.round( (options.maxCount ?? 20) * 1.5 ) /*Some padding for disambiguation pages */ } . ?label wikibase:apiOutput mwapi:label . ?description wikibase:apiOutput "@description" . } ${instanceOf} ${minusPhrases.join("\n ")} } ORDER BY ASC(?num) LIMIT ${options.maxCount ?? 20}` const url = wds.sparqlQuery(sparql) const result = await Utils.downloadJson(url) /*The full uri of the wikidata-item*/ return result.results.bindings.map(({ item, label, description, num }) => ({ relevance: num?.value, id: item?.value, label: label?.value, description: description?.value, })) } public static async search( search: string, options?: WikidataSearchoptions, page = 1 ): Promise< { id: string label: string description: string }[] > { const maxCount = options?.maxCount ?? 20 let pageCount = Math.min(maxCount, 50) const start = page * pageCount - pageCount const lang = options?.lang ?? "en" const url = "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=" + search + "&language=" + lang + "&limit=" + pageCount + "&continue=" + start + "&format=json&uselang=" + lang + "&type=item&origin=*" + "&props=" // props= removes some unused values in the result const response = await Utils.downloadJsonCached(url, 10000) const result: any[] = response.search if (result.length < pageCount) { // No next page return result } if (result.length < maxCount) { const newOptions = { ...options } newOptions.maxCount = maxCount - result.length result.push(...(await Wikidata.search(search, newOptions, page + 1))) } return result } public static async searchAndFetch( search: string, options?: WikidataAdvancedSearchoptions ): Promise { // We provide some padding to filter away invalid values const searchResults = await Wikidata.searchAdvanced(search, options) const maybeResponses = await Promise.all( searchResults.map(async (r) => { try { console.log("Loading ", r.id) return await Wikidata.LoadWikidataEntry(r.id).AsPromise() } catch (e) { console.error(e) return undefined } }) ) return Utils.NoNull(maybeResponses.map((r) => r["success"])) } /** * Gets the 'key' segment from a URL * * Wikidata.ExtractKey("https://www.wikidata.org/wiki/Lexeme:L614072") // => "L614072" * Wikidata.ExtractKey("http://www.wikidata.org/entity/Q55008046") // => "Q55008046" */ public static ExtractKey(value: string | number): string { if (typeof value === "number") { return "Q" + value } if (value === undefined) { console.error("ExtractKey: value is undefined") return undefined } value = value.trim().toLowerCase() for (const prefix of Wikidata._prefixesToRemove) { if (value.startsWith(prefix)) { value = value.substring(prefix.length) } } if (value.startsWith("http") && value === "") { // Probably some random link in the image field - we skip it return undefined } for (const identifierPrefix of Wikidata._identifierPrefixes) { if (value.startsWith(identifierPrefix)) { const trimmed = value.substring(identifierPrefix.length) if (trimmed === "") { return undefined } const n = Number(trimmed) if (isNaN(n)) { return undefined } return value.toUpperCase() } } if (value !== "" && !isNaN(Number(value))) { return "Q" + value } return undefined } /** * Converts 'Q123' into 123, returns undefined if invalid * * Wikidata.QIdToNumber("Q123") // => 123 * Wikidata.QIdToNumber(" Q123 ") // => 123 * Wikidata.QIdToNumber(" X123 ") // => undefined * Wikidata.QIdToNumber(" Q123X ") // => undefined * Wikidata.QIdToNumber(undefined) // => undefined * Wikidata.QIdToNumber(123) // => 123 */ public static QIdToNumber(q: string | number): number | undefined { if (q === undefined || q === null) { return } if (typeof q === "number") { return q } q = q.trim() if (!q.startsWith("Q")) { return } q = q.substr(1) const n = Number(q) if (isNaN(n)) { return } return n } public static IdToArticle(id: string) { if (id.startsWith("Q")) { return "https://wikidata.org/wiki/" + id } if (id.startsWith("L")) { return "https://wikidata.org/wiki/Lexeme:" + id } throw "Unknown id type: " + id } /** * Build a SPARQL-query, return the result * * @param keys: how variables are named. Every key not ending with 'Label' should appear in at least one statement * @param statements * @constructor */ public static async Sparql( keys: string[], statements: string[] ): Promise<(T & Record)[]> { const query = "SELECT " + keys.map((k) => (k.startsWith("?") ? k : "?" + k)).join(" ") + "\n" + "WHERE\n" + "{\n" + statements.map((stmt) => (stmt.endsWith(".") ? stmt : stmt + ".")).join("\n") + ' SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }\n' + "}" const url = wds.sparqlQuery(query) const result = await Utils.downloadJsonCached(url, 24 * 60 * 60 * 1000) return result.results.bindings } /** * Loads a wikidata page * @returns the entity of the given value */ public static async LoadWikidataEntryAsync(value: string | number): Promise { const id = Wikidata.ExtractKey(value) if (id === undefined) { console.warn("Could not extract a wikidata entry from", value) return undefined } const url = "https://www.wikidata.org/wiki/Special:EntityData/" + id + ".json" const entities = (await Utils.downloadJsonCached(url, 10000)).entities const firstKey = Array.from(Object.keys(entities))[0] // Roundabout way to fetch the entity; it might have been a redirect const response = entities[firstKey] if (id.startsWith("L")) { // This is a lexeme: return new WikidataLexeme(response).asWikidataResponse() } return WikidataResponse.fromJson(response) } }