// Source: mapcomplete/Logic/Web/Wikidata.ts (TypeScript, 412 lines, 14 KiB)
import { Utils } from "../../Utils"
import { UIEventSource } from "../UIEventSource"
import * as wds from "wikidata-sdk"
/**
 * A simplified, read-only view of a single wikidata entity.
 *
 * All maps keyed "by language" use the language code exactly as it appears in the wikidata JSON.
 */
export class WikidataResponse {
    /** The identifier of the entity, as found in the `id` field of the entity JSON */
    public readonly id: string
    /** Language code --> label */
    public readonly labels: Map<string, string>
    /** Language code --> description */
    public readonly descriptions: Map<string, string>
    /** Property id --> set of simplified claim values (see `extractClaims`) */
    public readonly claims: Map<string, Set<string>>
    /** Site prefix (the sitelink id without its trailing `wiki`) --> page title; never contains `commons` when built by `fromJson` */
    public readonly wikisites: Map<string, string>
    /** The title of the `commonswiki` sitelink, `undefined` if there is none */
    public readonly commons: string

    constructor(
        id: string,
        labels: Map<string, string>,
        descriptions: Map<string, string>,
        claims: Map<string, Set<string>>,
        wikisites: Map<string, string>,
        commons: string
    ) {
        this.id = id
        this.labels = labels
        this.descriptions = descriptions
        this.claims = claims
        this.wikisites = wikisites
        this.commons = commons
    }

    /**
     * Builds a WikidataResponse from the JSON of a single entity.
     *
     * @param entity an object with (optionally) `labels`, `descriptions`, `sitelinks` and `claims`, and an `id`
     * @returns the simplified entity
     */
    public static fromJson(entity: any): WikidataResponse {
        const labels = new Map<string, string>()
        for (const language in entity.labels) {
            // The key is the language code
            labels.set(language, entity.labels[language].value)
        }

        const descriptions = new Map<string, string>()
        for (const language in entity.descriptions) {
            // The key is the language code
            descriptions.set(language, entity.descriptions[language].value)
        }

        const siteSuffix = "wiki"
        const sitelinks = new Map<string, string>()
        for (const siteId in entity.sitelinks) {
            // Only ids of the form `${language}wiki` can be reduced to a language code by
            // dropping the suffix. Other sites (e.g. `enwikiquote`, `enwikivoyage`) would be
            // mangled into a meaningless key such as `enwikiq`, so they are skipped.
            if (!siteId.endsWith(siteSuffix)) {
                continue
            }
            const language = siteId.substring(0, siteId.length - siteSuffix.length)
            sitelinks.set(language, entity.sitelinks[siteId].title)
        }

        // `commonswiki` is not a language: it is exposed separately
        const commons = sitelinks.get("commons")
        sitelinks.delete("commons")

        const claims = WikidataResponse.extractClaims(entity.claims)
        return new WikidataResponse(entity.id, labels, descriptions, claims, sitelinks, commons)
    }

    /**
     * Simplifies the raw claims with `wds.simplify.claims` (using the "simple-day" time converter)
     * and collects the values of every property into a set.
     *
     * @param claimsJson the `claims` part of an entity
     * @returns property id --> set of simplified values
     */
    static extractClaims(claimsJson: any): Map<string, Set<string>> {
        const simplified = wds.simplify.claims(claimsJson, {
            timeConverter: "simple-day",
        })

        const claims = new Map<string, Set<string>>()
        for (const claimId in simplified) {
            const claimsList: any[] = simplified[claimId]
            claims.set(claimId, new Set(claimsList))
        }
        return claims
    }
}
/**
 * A simplified view of a wikidata lexeme (an entity whose id starts with `L`).
 */
export class WikidataLexeme {
    id: string
    /** Language code --> lemma */
    lemma: Map<string, string>
    /** Language code --> the glosses of all senses in that language, joined with "; " */
    senses: Map<string, string>
    /** Property id --> set of simplified claim values */
    claims: Map<string, Set<string>>

    /**
     * @param json the JSON of a single lexeme entity; reads `id`, `claims`, `lemmas` and `senses`
     */
    constructor(json: any) {
        this.id = json.id
        this.claims = WikidataResponse.extractClaims(json.claims)

        this.lemma = new Map<string, string>()
        for (const language in json.lemmas) {
            this.lemma.set(language, json.lemmas[language].value)
        }

        this.senses = new Map<string, string>()
        // A lexeme without a `senses` field is treated as having no senses
        for (const sense of json.senses ?? []) {
            const glosses = sense.glosses
            for (const language in glosses) {
                // The fallback must be applied to the gloss itself: `a + b ?? ""` parses as
                // `(a + b) ?? ""`, which would append the text "undefined" for a missing value
                const gloss: string = glosses[language].value ?? ""
                const previousSenses = this.senses.get(language)
                this.senses.set(
                    language,
                    previousSenses === undefined ? gloss : previousSenses + "; " + gloss
                )
            }
        }
    }

    /**
     * Converts this lexeme into a WikidataResponse: the lemmas are used as labels and the
     * senses as descriptions. The result has no wikisites and no commons entry.
     */
    asWikidataResponse() {
        return new WikidataResponse(
            this.id,
            this.lemma,
            this.senses,
            this.claims,
            new Map(),
            undefined
        )
    }
}
/**
 * Options shared by the wikidata search functions.
 */
export interface WikidataSearchoptions {
    /**
     * Language code to search in; `Wikidata.search` falls back to "en" when omitted
     */
    lang?: string
    /**
     * The maximum number of results to return; the search functions fall back to 20 when omitted
     */
    maxCount?: number
}
/**
 * Options for `Wikidata.searchAdvanced`, which can additionally filter on 'instance of'.
 * All ids are numeric, i.e. without the leading `Q`.
 */
export interface WikidataAdvancedSearchoptions extends WikidataSearchoptions {
    /**
     * If given and non-empty: only items matching `wdt:P31/wdt:P279*` for at least one of these ids are returned
     */
    instanceOf?: number[]
    /**
     * Items matching `wdt:P31/wdt:P279*` for any of these ids are removed from the results
     */
    notInstanceOf?: number[]
}
/**
 * Utility functions around wikidata
 */
export default class Wikidata {
    /** Lower-cased identifier prefixes: `q` and `l` */
    private static readonly _identifierPrefixes = ["Q", "L"].map((str) => str.toLowerCase())
    /** Lower-cased URL/namespace prefixes which are stripped before an identifier is parsed */
    private static readonly _prefixesToRemove = [
        "https://www.wikidata.org/wiki/Lexeme:",
        "https://www.wikidata.org/wiki/",
        "http://www.wikidata.org/entity/",
        "Lexeme:",
    ].map((str) => str.toLowerCase())

    /** Extracted key --> the (possibly still loading) entry, so that every entry is requested only once */
    private static readonly _cache = new Map<
        string,
        UIEventSource<{ success: WikidataResponse } | { error: any }>
    >()

    /**
     * Same as LoadWikidataEntryAsync, but wrapped into a UIEventSource.
     * The source is cached per extracted key: asking for the same entry twice yields the same source.
     *
     * @param value an identifier, URL or number which `ExtractKey` understands
     */
    public static LoadWikidataEntry(
        value: string | number
    ): UIEventSource<{ success: WikidataResponse } | { error: any }> {
        const key = Wikidata.ExtractKey(value)
        const cached = Wikidata._cache.get(key)
        if (cached !== undefined) {
            return cached
        }
        const src = UIEventSource.FromPromiseWithErr(Wikidata.LoadWikidataEntryAsync(key))
        Wikidata._cache.set(key, src)
        return src
    }

    /**
     * Given a search text, searches for the relevant wikidata entries, excluding pages "outside of the main tree", e.g. disambiguation pages.
     * Optionally, an 'instance of' can be given to limit the scope, e.g. instanceOf:5 (humans) will only search for humans
     *
     * @param text the text to search for; it is escaped before being placed into the query
     * @param options may be omitted; `lang` defaults to "en" and `maxCount` to 20
     * @returns one object per binding of the query result
     */
    public static async searchAdvanced(
        text: string,
        options: WikidataAdvancedSearchoptions
    ): Promise<
        {
            id: string
            relevance?: number
            label: string
            description?: string
        }[]
    > {
        // `options` can be undefined (e.g. when passed through by `searchAndFetch`),
        // so every access goes through `?.` and gets an explicit default
        const lang = options?.lang ?? "en"
        const maxCount = options?.maxCount ?? 20

        let instanceOf = ""
        if (options?.instanceOf !== undefined && options.instanceOf.length > 0) {
            const phrases = options.instanceOf.map((q) => `{ ?item wdt:P31/wdt:P279* wd:Q${q}. }`)
            instanceOf = "{" + phrases.join(" UNION ") + "}"
        }
        const forbidden = (options?.notInstanceOf ?? []).concat([17379835]) // blacklist 'wikimedia pages outside of the main knowledge tree', e.g. disambiguation pages
        const minusPhrases = forbidden.map((q) => `MINUS {?item wdt:P31/wdt:P279* wd:Q${q} .}`)
        const sparql = `SELECT * WHERE {
            SERVICE wikibase:mwapi {
                bd:serviceParam wikibase:api "EntitySearch" .
                bd:serviceParam wikibase:endpoint "www.wikidata.org" .
                bd:serviceParam mwapi:search "${Wikidata.escapeSparqlString(text)}" .
                bd:serviceParam mwapi:language "${Wikidata.escapeSparqlString(lang)}" .
                ?item wikibase:apiOutputItem mwapi:item .
                ?num wikibase:apiOrdinal true .
                bd:serviceParam wikibase:limit ${
                    Math.round(maxCount * 1.5) /*Some padding for disambiguation pages */
                } .
                ?label wikibase:apiOutput mwapi:label .
                ?description wikibase:apiOutput "@description" .
            }
            ${instanceOf}
            ${minusPhrases.join("\n            ")}
        } ORDER BY ASC(?num) LIMIT ${maxCount}`
        const url = wds.sparqlQuery(sparql)
        const result = await Utils.downloadJson(url)
        /*The full uri of the wikidata-item*/
        return result.results.bindings.map(({ item, label, description, num }) => ({
            relevance: num?.value,
            id: item?.value,
            label: label?.value,
            description: description?.value,
        }))
    }

    /**
     * Searches wikidata items with the `wbsearchentities` API.
     *
     * At most 50 results are requested per call to the API; further requests are made until
     * `maxCount` results are collected or the API returns fewer results than asked for.
     *
     * @param search the text to search for; it is URL-encoded before use
     * @param options `lang` defaults to "en", `maxCount` to 20
     * @param page 1-based page to start from, where a page contains `min(maxCount, 50)` results
     * @returns the entries of the `search` field of the API responses
     */
    public static async search(
        search: string,
        options?: WikidataSearchoptions,
        page = 1
    ): Promise<
        {
            id: string
            label: string
            description: string
        }[]
    > {
        const maxPerRequest = 50
        const maxCount = options?.maxCount ?? 20
        const pageCount = Math.min(maxCount, maxPerRequest)
        const lang = encodeURIComponent(options?.lang ?? "en")
        const encodedSearch = encodeURIComponent(search)

        // The offset is tracked explicitly: deriving it from a page number breaks as soon as the
        // last request asks for fewer results than the previous ones, which yields duplicates
        let offset = page * pageCount - pageCount
        const results: any[] = []
        while (results.length < maxCount) {
            const limit = Math.min(maxCount - results.length, maxPerRequest)
            const url =
                "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=" +
                encodedSearch +
                "&language=" +
                lang +
                "&limit=" +
                limit +
                "&continue=" +
                offset +
                "&format=json&uselang=" +
                lang +
                "&type=item&origin=*" +
                "&props=" // props= removes some unused values in the result
            const response = await Utils.downloadJsonCached(url, 10000)
            const batch: any[] = response.search
            results.push(...batch)
            if (batch.length < limit) {
                // No next page
                break
            }
            offset += batch.length
        }
        return results
    }

    /**
     * Runs `searchAdvanced` and loads the full entry of every result.
     * Results which could not be loaded are left out.
     */
    public static async searchAndFetch(
        search: string,
        options?: WikidataAdvancedSearchoptions
    ): Promise<WikidataResponse[]> {
        // We provide some padding to filter away invalid values
        const searchResults = await Wikidata.searchAdvanced(search, options)
        const maybeResponses = await Promise.all(
            searchResults.map(async (r) => {
                try {
                    console.log("Loading ", r.id)
                    return await Wikidata.LoadWikidataEntry(r.id).AsPromise()
                } catch (e) {
                    console.error(e)
                    return undefined
                }
            })
        )
        // An entry is `undefined` if loading threw and has no `success` if loading failed: both are dropped
        return Utils.NoNull(maybeResponses.map((r) => <WikidataResponse>r?.["success"]))
    }

    /**
     * Gets the 'key' segment from a URL
     *
     * Wikidata.ExtractKey("https://www.wikidata.org/wiki/Lexeme:L614072") // => "L614072"
     * Wikidata.ExtractKey("http://www.wikidata.org/entity/Q55008046") // => "Q55008046"
     * Wikidata.ExtractKey(" q42 ") // => "Q42"
     * Wikidata.ExtractKey("42") // => "Q42"
     * Wikidata.ExtractKey(42) // => "Q42"
     * Wikidata.ExtractKey("Q1e5") // => undefined
     * Wikidata.ExtractKey("https://example.org/Q42") // => undefined
     *
     * @returns the upper-cased identifier, or undefined if no identifier could be extracted
     */
    public static ExtractKey(value: string | number): string {
        if (typeof value === "number") {
            return "Q" + value
        }
        if (value === undefined || value === null) {
            console.error("ExtractKey: value is undefined")
            return undefined
        }
        value = value.trim().toLowerCase()

        for (const prefix of Wikidata._prefixesToRemove) {
            if (value.startsWith(prefix)) {
                value = value.substring(prefix.length)
            }
        }

        if (value.startsWith("http")) {
            // Probably some random link in the image field - we skip it
            return undefined
        }

        // Only plain decimal digits are a valid numeric part; `Number(...)` would also
        // accept notations such as "1e5", "0x1f" or "12.5"
        const digitsOnly = /^[0-9]+$/
        for (const identifierPrefix of Wikidata._identifierPrefixes) {
            if (value.startsWith(identifierPrefix)) {
                const numericPart = value.substring(identifierPrefix.length)
                if (!digitsOnly.test(numericPart)) {
                    return undefined
                }
                return value.toUpperCase()
            }
        }

        if (digitsOnly.test(value)) {
            return "Q" + value
        }

        return undefined
    }

    /**
     * Converts 'Q123' into 123, returns undefined if invalid
     *
     * Wikidata.QIdToNumber("Q123") // => 123
     * Wikidata.QIdToNumber("  Q123  ") // => 123
     * Wikidata.QIdToNumber("  X123  ") // => undefined
     * Wikidata.QIdToNumber("  Q123X  ") // => undefined
     * Wikidata.QIdToNumber("Q") // => undefined
     * Wikidata.QIdToNumber(undefined) // => undefined
     * Wikidata.QIdToNumber(123) // => 123
     */
    public static QIdToNumber(q: string | number): number | undefined {
        if (q === undefined || q === null) {
            return undefined
        }
        if (typeof q === "number") {
            return q
        }
        const trimmed = q.trim()
        // An upper-case 'Q' followed by at least one decimal digit. Checking the digits
        // explicitly matters: `Number("")` is 0, so a lone "Q" would otherwise become 0
        if (!/^Q[0-9]+$/.test(trimmed)) {
            return undefined
        }
        return Number(trimmed.substring(1))
    }

    /**
     * Gives the URL of the wikidata page of the given identifier.
     *
     * @param id an identifier starting with an upper-case `Q` or `L`
     * @throws a string (not an Error object) if the id starts with neither
     */
    public static IdToArticle(id: string) {
        if (id.startsWith("Q")) {
            return "https://wikidata.org/wiki/" + id
        }
        if (id.startsWith("L")) {
            return "https://wikidata.org/wiki/Lexeme:" + id
        }
        throw "Unknown id type: " + id
    }

    /**
     * Build a SPARQL-query, return the result
     *
     * @param keys how variables are named; a leading `?` is added if missing. Every key not ending with 'Label' should appear in at least one statement
     * @param statements the statements of the WHERE-clause; a trailing `.` is added if missing
     * @returns the `results.bindings` of the response
     */
    public static async Sparql<T>(
        keys: string[],
        statements: string[]
    ): Promise<(T & Record<string, { type: string; value: string }>)[]> {
        const query =
            "SELECT " +
            keys.map((k) => (k.startsWith("?") ? k : "?" + k)).join(" ") +
            "\n" +
            "WHERE\n" +
            "{\n" +
            statements.map((stmt) => (stmt.endsWith(".") ? stmt : stmt + ".")).join("\n") +
            '  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }\n' +
            "}"
        const url = wds.sparqlQuery(query)
        const result = await Utils.downloadJsonCached(url, 24 * 60 * 60 * 1000)
        return result.results.bindings
    }

    /**
     * Loads a wikidata page
     *
     * @param value an identifier, URL or number which `ExtractKey` understands
     * @returns the entity of the given value; undefined if no identifier could be extracted
     */
    public static async LoadWikidataEntryAsync(value: string | number): Promise<WikidataResponse> {
        const id = Wikidata.ExtractKey(value)
        if (id === undefined) {
            console.warn("Could not extract a wikidata entry from", value)
            return undefined
        }

        const url = "https://www.wikidata.org/wiki/Special:EntityData/" + id + ".json"
        const entities = (await Utils.downloadJsonCached(url, 10000)).entities
        const firstKey = <string>Array.from(Object.keys(entities))[0] // Roundabout way to fetch the entity; it might have been a redirect
        const response = entities[firstKey]

        if (id.startsWith("L")) {
            // This is a lexeme:
            return new WikidataLexeme(response).asWikidataResponse()
        }

        return WikidataResponse.fromJson(response)
    }

    /**
     * Escapes a text so that it can be placed between double quotes in a SPARQL query.
     */
    private static escapeSparqlString(text: string): string {
        return text
            .replace(/\\/g, "\\\\")
            .replace(/"/g, '\\"')
            .replace(/\n/g, "\\n")
            .replace(/\r/g, "\\r")
    }
}