mapcomplete/Logic/Web/Wikidata.ts

381 lines
13 KiB
TypeScript

import {Utils} from "../../Utils";
import {UIEventSource} from "../UIEventSource";
import * as wds from "wikidata-sdk"
export class WikidataResponse {
public readonly id: string
public readonly labels: Map<string, string>
public readonly descriptions: Map<string, string>
public readonly claims: Map<string, Set<string>>
public readonly wikisites: Map<string, string>
public readonly commons: string
constructor(
id: string,
labels: Map<string, string>,
descriptions: Map<string, string>,
claims: Map<string, Set<string>>,
wikisites: Map<string, string>,
commons: string
) {
this.id = id
this.labels = labels
this.descriptions = descriptions
this.claims = claims
this.wikisites = wikisites
this.commons = commons
}
public static fromJson(entity: any): WikidataResponse {
const labels = new Map<string, string>()
for (const labelName in entity.labels) {
// The labelname is the language code
labels.set(labelName, entity.labels[labelName].value)
}
const descr = new Map<string, string>()
for (const labelName in entity.descriptions) {
// The labelname is the language code
descr.set(labelName, entity.descriptions[labelName].value)
}
const sitelinks = new Map<string, string>();
for (const labelName in entity.sitelinks) {
// labelName is `${language}wiki`
const language = labelName.substring(0, labelName.length - 4)
const title = entity.sitelinks[labelName].title
sitelinks.set(language, title)
}
const commons = sitelinks.get("commons")
sitelinks.delete("commons")
const claims = WikidataResponse.extractClaims(entity.claims);
return new WikidataResponse(
entity.id,
labels,
descr,
claims,
sitelinks,
commons
)
}
static extractClaims(claimsJson: any): Map<string, Set<string>> {
const simplified = wds.simplify.claims(claimsJson, {
timeConverter: 'simple-day'
})
const claims = new Map<string, Set<string>>();
for (const claimId in simplified) {
const claimsList: any[] = simplified[claimId]
claims.set(claimId, new Set(claimsList));
}
return claims
}
}
export class WikidataLexeme {
id: string
lemma: Map<string, string>
senses: Map<string, string>
claims: Map<string, Set<string>>
constructor(json) {
this.id = json.id
this.claims = WikidataResponse.extractClaims(json.claims)
this.lemma = new Map<string, string>()
for (const language in json.lemmas) {
this.lemma.set(language, json.lemmas[language].value)
}
this.senses = new Map<string, string>()
for (const sense of json.senses) {
const glosses = sense.glosses
for (const language in glosses) {
let previousSenses = this.senses.get(language)
if (previousSenses === undefined) {
previousSenses = ""
} else {
previousSenses = previousSenses + "; "
}
this.senses.set(language, previousSenses + glosses[language].value ?? "")
}
}
}
asWikidataResponse() {
return new WikidataResponse(
this.id,
this.lemma,
this.senses,
this.claims,
new Map(),
undefined
);
}
}
export interface WikidataSearchoptions {
lang?: "en" | string,
maxCount?: 20 | number
}
export interface WikidataAdvancedSearchoptions extends WikidataSearchoptions {
instanceOf?: number[];
notInstanceOf?: number[]
}
/**
* Utility functions around wikidata
*/
export default class Wikidata {
private static readonly _identifierPrefixes = ["Q", "L"].map(str => str.toLowerCase())
private static readonly _prefixesToRemove = ["https://www.wikidata.org/wiki/Lexeme:",
"https://www.wikidata.org/wiki/",
"http://www.wikidata.org/entity/",
"Lexeme:"].map(str => str.toLowerCase())
private static readonly _cache = new Map<string, UIEventSource<{ success: WikidataResponse } | { error: any }>>()
public static LoadWikidataEntry(value: string | number): UIEventSource<{ success: WikidataResponse } | { error: any }> {
const key = this.ExtractKey(value)
const cached = Wikidata._cache.get(key)
if (cached !== undefined) {
return cached
}
const src = UIEventSource.FromPromiseWithErr(Wikidata.LoadWikidataEntryAsync(key))
Wikidata._cache.set(key, src)
return src;
}
/**
* Given a search text, searches for the relevant wikidata entries, excluding pages "outside of the main tree", e.g. disambiguation pages.
* Optionally, an 'instance of' can be given to limit the scope, e.g. instanceOf:5 (humans) will only search for humans
*/
public static async searchAdvanced(text: string, options: WikidataAdvancedSearchoptions): Promise<{
id: string,
relevance?: number,
label: string,
description?: string
}[]> {
let instanceOf = ""
if (options?.instanceOf !== undefined && options.instanceOf.length > 0) {
const phrases = options.instanceOf.map(q => `{ ?item wdt:P31/wdt:P279* wd:Q${q}. }`)
instanceOf = "{"+ phrases.join(" UNION ") + "}"
}
const forbidden = (options?.notInstanceOf ?? [])
.concat([17379835]) // blacklist 'wikimedia pages outside of the main knowledge tree', e.g. disambiguation pages
const minusPhrases = forbidden.map(q => `MINUS {?item wdt:P31/wdt:P279* wd:Q${q} .}`)
const sparql = `SELECT * WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api "EntitySearch" .
bd:serviceParam wikibase:endpoint "www.wikidata.org" .
bd:serviceParam mwapi:search "${text}" .
bd:serviceParam mwapi:language "${options.lang}" .
?item wikibase:apiOutputItem mwapi:item .
?num wikibase:apiOrdinal true .
bd:serviceParam wikibase:limit ${Math.round((options.maxCount ?? 20) * 1.5) /*Some padding for disambiguation pages */} .
?label wikibase:apiOutput mwapi:label .
?description wikibase:apiOutput "@description" .
}
${instanceOf}
${minusPhrases.join("\n ")}
} ORDER BY ASC(?num) LIMIT ${options.maxCount ?? 20}`
const url = wds.sparqlQuery(sparql)
const result = await Utils.downloadJson(url)
/*The full uri of the wikidata-item*/
return result.results.bindings.map(({item, label, description, num}) => ({
relevance: num?.value,
id: item?.value,
label: label?.value,
description: description?.value
}))
}
public static async search(
search: string,
options?: WikidataSearchoptions,
page = 1
): Promise<{
id: string,
label: string,
description: string
}[]> {
const maxCount = options?.maxCount ?? 20
let pageCount = Math.min(maxCount, 50)
const start = page * pageCount - pageCount;
const lang = (options?.lang ?? "en")
const url =
"https://www.wikidata.org/w/api.php?action=wbsearchentities&search=" +
search +
"&language=" +
lang +
"&limit=" + pageCount + "&continue=" +
start +
"&format=json&uselang=" +
lang +
"&type=item&origin=*" +
"&props=";// props= removes some unused values in the result
const response = await Utils.downloadJsonCached(url, 10000)
const result: any[] = response.search
if (result.length < pageCount) {
// No next page
return result;
}
if (result.length < maxCount) {
const newOptions = {...options}
newOptions.maxCount = maxCount - result.length
result.push(...await Wikidata.search(search,
newOptions,
page + 1
))
}
return result;
}
public static async searchAndFetch(
search: string,
options?: WikidataAdvancedSearchoptions
): Promise<WikidataResponse[]> {
// We provide some padding to filter away invalid values
const searchResults = await Wikidata.searchAdvanced(search, options)
const maybeResponses = await Promise.all(
searchResults.map(async r => {
try {
console.log("Loading ", r.id)
return await Wikidata.LoadWikidataEntry(r.id).AsPromise()
} catch (e) {
console.error(e)
return undefined;
}
}))
return Utils.NoNull(maybeResponses.map(r => <WikidataResponse>r["success"]))
}
/**
* Gets the 'key' segment from a URL
*
* Wikidata.ExtractKey("https://www.wikidata.org/wiki/Lexeme:L614072") // => "L614072"
* Wikidata.ExtractKey("http://www.wikidata.org/entity/Q55008046") // => "Q55008046"
*/
public static ExtractKey(value: string | number): string {
if (typeof value === "number") {
return "Q" + value
}
if (value === undefined) {
console.error("ExtractKey: value is undefined")
return undefined;
}
value = value.trim().toLowerCase()
for (const prefix of Wikidata._prefixesToRemove) {
if (value.startsWith(prefix)) {
value = value.substring(prefix.length)
}
}
if (value.startsWith("http") && value === "") {
// Probably some random link in the image field - we skip it
return undefined
}
for (const identifierPrefix of Wikidata._identifierPrefixes) {
if (value.startsWith(identifierPrefix)) {
const trimmed = value.substring(identifierPrefix.length);
if (trimmed === "") {
return undefined
}
const n = Number(trimmed)
if (isNaN(n)) {
return undefined
}
return value.toUpperCase();
}
}
if (value !== "" && !isNaN(Number(value))) {
return "Q" + value
}
return undefined;
}
/**
* Converts 'Q123' into 123, returns undefined if invalid
*
* Wikidata.QIdToNumber("Q123") // => 123
* Wikidata.QIdToNumber(" Q123 ") // => 123
* Wikidata.QIdToNumber(" X123 ") // => undefined
* Wikidata.QIdToNumber(" Q123X ") // => undefined
* Wikidata.QIdToNumber(undefined) // => undefined
* Wikidata.QIdToNumber(123) // => 123
*/
public static QIdToNumber(q: string | number): number | undefined {
if(q === undefined || q === null){
return
}
if(typeof q === "number"){
return q
}
q = q.trim()
if (!q.startsWith("Q")) {
return
}
q = q.substr(1)
const n = Number(q)
if (isNaN(n)) {
return
}
return n
}
public static IdToArticle(id: string) {
if (id.startsWith("Q")) {
return "https://wikidata.org/wiki/" + id
}
if (id.startsWith("L")) {
return "https://wikidata.org/wiki/Lexeme:" + id
}
throw "Unknown id type: " + id
}
/**
* Loads a wikidata page
* @returns the entity of the given value
*/
public static async LoadWikidataEntryAsync(value: string | number): Promise<WikidataResponse> {
const id = Wikidata.ExtractKey(value)
if (id === undefined) {
console.warn("Could not extract a wikidata entry from", value)
return undefined
}
const url = "https://www.wikidata.org/wiki/Special:EntityData/" + id + ".json";
const entities = (await Utils.downloadJsonCached(url, 10000)).entities
const firstKey = <string>Array.from(Object.keys(entities))[0] // Roundabout way to fetch the entity; it might have been a redirect
const response = entities[firstKey]
if (id.startsWith("L")) {
// This is a lexeme:
return new WikidataLexeme(response).asWikidataResponse()
}
return WikidataResponse.fromJson(response)
}
}