2021-10-02 22:31:16 +02:00
import { Utils } from "../../Utils"
2023-04-21 16:02:36 +02:00
import { Store , UIEventSource } from "../UIEventSource"
2022-04-22 01:45:54 +02:00
import * as wds from "wikidata-sdk"
2021-10-02 22:31:16 +02:00
2021-10-09 22:40:52 +02:00
/**
 * Simplified, read-only view on a single wikidata entity.
 * Instances are normally built from the raw entity JSON with `WikidataResponse.fromJson`.
 */
export class WikidataResponse {
    /** The identifier of the entity, copied from `entity.id` */
    public readonly id: string
    /** Labels of the entity, keyed by language code */
    public readonly labels: Map<string, string>
    /** Descriptions of the entity, keyed by language code */
    public readonly descriptions: Map<string, string>
    /** Simplified claim values, keyed by the claim id as returned by `wds.simplify.claims` */
    public readonly claims: Map<string, Set<string>>
    /** Page titles of the linked wikis, keyed by the sitelink name without its `wiki` suffix (`enwiki` becomes `en`) */
    public readonly wikisites: Map<string, string>
    /** Page title of the `commonswiki` sitelink; `undefined` if the entity has none */
    public readonly commons: string

    constructor(
        id: string,
        labels: Map<string, string>,
        descriptions: Map<string, string>,
        claims: Map<string, Set<string>>,
        wikisites: Map<string, string>,
        commons: string
    ) {
        this.id = id
        this.labels = labels
        this.descriptions = descriptions
        this.claims = claims
        this.wikisites = wikisites
        this.commons = commons
    }

    /**
     * Converts the raw entity JSON into a WikidataResponse.
     *
     * @param entity the entity object; `id`, `labels`, `descriptions`, `sitelinks` and `claims` are read
     * @returns the simplified entity
     */
    public static fromJson(entity: any): WikidataResponse {
        const labels = new Map<string, string>()
        for (const language in entity.labels) {
            // The key is the language code
            labels.set(language, entity.labels[language].value)
        }

        const descr = new Map<string, string>()
        for (const language in entity.descriptions) {
            // The key is the language code
            descr.set(language, entity.descriptions[language].value)
        }

        const wikiSuffix = "wiki"
        const sitelinks = new Map<string, string>()
        for (const siteName in entity.sitelinks) {
            // siteName is `${language}wiki` for the sites we are interested in.
            // Names with another suffix (e.g. `enwikiquote`) used to be cut off blindly,
            // which yielded garbage keys such as `enwikiq`; those are skipped now.
            if (!siteName.endsWith(wikiSuffix)) {
                continue
            }
            const language = siteName.substring(0, siteName.length - wikiSuffix.length)
            sitelinks.set(language, entity.sitelinks[siteName].title)
        }

        // `commonswiki` is exposed separately and is not a regular wiki site
        const commons = sitelinks.get("commons")
        sitelinks.delete("commons")

        const claims = WikidataResponse.extractClaims(entity.claims)
        return new WikidataResponse(entity.id, labels, descr, claims, sitelinks, commons)
    }

    /**
     * Simplifies the raw claims with `wds.simplify.claims` (using the "simple-day" time converter)
     * and collects the values of every claim into a set.
     *
     * @param claimsJson the `claims` object of an entity
     * @returns the simplified values, keyed by claim id
     */
    static extractClaims(claimsJson: any): Map<string, Set<string>> {
        const simplified = wds.simplify.claims(claimsJson, {
            timeConverter: "simple-day",
        })

        const claims = new Map<string, Set<string>>()
        for (const claimId in simplified) {
            const claimsList: any[] = simplified[claimId]
            claims.set(claimId, new Set(claimsList))
        }
        return claims
    }
}
/**
 * Simplified view on a wikidata lexeme.
 * The lemmas take the role of the labels and the (joined) sense glosses take the role of the descriptions,
 * see `asWikidataResponse`.
 */
export class WikidataLexeme {
    /** The identifier of the lexeme, copied from `json.id` */
    id: string
    /** Lemma per language */
    lemma: Map<string, string>
    /** All glosses of all senses per language, joined with "; " */
    senses: Map<string, string>
    /** Simplified claims, see `WikidataResponse.extractClaims` */
    claims: Map<string, Set<string>>

    /**
     * @param json the raw lexeme object; `id`, `claims`, `lemmas` and `senses` are read
     */
    constructor(json: any) {
        this.id = json.id
        this.claims = WikidataResponse.extractClaims(json.claims)

        this.lemma = new Map<string, string>()
        for (const language in json.lemmas) {
            this.lemma.set(language, json.lemmas[language].value)
        }

        this.senses = new Map<string, string>()
        // A lexeme without any senses must not crash the constructor
        for (const sense of json.senses ?? []) {
            const glosses = sense.glosses
            for (const language in glosses) {
                // The fallback has to be applied to the gloss itself: `a + b ?? ""` is parsed as `(a + b) ?? ""`,
                // which appended the literal text "undefined" for a gloss without value
                const gloss: string = glosses[language].value ?? ""
                const previousSenses = this.senses.get(language)
                this.senses.set(
                    language,
                    previousSenses === undefined ? gloss : previousSenses + "; " + gloss
                )
            }
        }
    }

    /**
     * Repackages this lexeme as a WikidataResponse: lemmas become the labels, senses become the descriptions.
     * The result has no wiki sites and no commons page.
     */
    asWikidataResponse() {
        return new WikidataResponse(
            this.id,
            this.lemma,
            this.senses,
            this.claims,
            new Map(),
            undefined
        )
    }
}
/**
 * Options for a text search on wikidata (see `Wikidata.search`)
 */
export interface WikidataSearchoptions {
// Language code to search in; `Wikidata.search` falls back to "en" if omitted
lang ? : "en" | string
// Maximum number of results; `Wikidata.search` falls back to 20 if omitted
maxCount? : 20 | number
}
2022-04-22 01:45:54 +02:00
/**
 * Options for `Wikidata.searchAdvanced`, which can additionally filter on 'instance of'.
 * The numbers are the numeric part of the Q-identifier, e.g. `5` for `Q5`.
 */
export interface WikidataAdvancedSearchoptions extends WikidataSearchoptions {
// If given and not empty: only items which are an instance of at least one of these are returned
instanceOf? : number [ ]
// Items which are an instance of one of these are removed from the results
notInstanceOf? : number [ ]
}
2021-10-09 22:40:52 +02:00
/**
 * Utility functions around wikidata
 */
export default class Wikidata {
    public static readonly neededUrls = [
        "https://www.wikidata.org/",
        "https://wikidata.org/",
        "https://query.wikidata.org",
        "https://m.wikidata.org", // Important: a mobile browser will request m.wikidata.org instead of www.wikidata.org ; this URL needs to be listed for the CSP
    ]

    /** Lowercased letters with which a valid identifier can start */
    private static readonly _identifierPrefixes = ["Q", "L"].map((str) => str.toLowerCase())

    /** Lowercased URL-prefixes which are stripped by `ExtractKey`, tried in this order */
    private static readonly _prefixesToRemove = [
        "https://www.wikidata.org/wiki/Lexeme:",
        "https://www.wikidata.org/wiki/",
        "http://www.wikidata.org/entity/",
        "Lexeme:",
    ].map((str) => str.toLowerCase())

    /** Stores handed out by `LoadWikidataEntry`, per extracted key */
    private static readonly _storeCache = new Map<
        string,
        Store<{ success: WikidataResponse } | { error: any }>
    >()

    /** Promises handed out by `LoadWikidataEntryAsync`, per stringified argument */
    private static _cache = new Map<string, Promise<WikidataResponse>>()

    /**
     * Same as LoadWikidataEntryAsync, but wrapped into a Store.
     * The store is cached per extracted key, so asking for the same entity twice yields the same store.
     *
     * @param value an identifier, a wikidata-URL or the numeric part of a Q-identifier
     */
    public static LoadWikidataEntry(
        value: string | number
    ): Store<{ success: WikidataResponse } | { error: any }> {
        const key = Wikidata.ExtractKey(value)
        const cached = Wikidata._storeCache.get(key)
        if (cached) {
            return cached
        }
        const src = UIEventSource.FromPromiseWithErr(Wikidata.LoadWikidataEntryAsync(key))
        Wikidata._storeCache.set(key, src)
        return src
    }

    /**
     * Given a search text, searches for the relevant wikidata entries, excluding pages "outside of the main tree", e.g. disambiguation pages.
     * Optionally, an 'instance of' can be given to limit the scope, e.g. instanceOf:5 (humans) will only search for humans
     *
     * @param text the text to search for
     * @param options optional; the language defaults to "en" and the maximum number of results to 20
     * @returns the bindings of the query; `id` is the full URI of the wikidata item
     */
    public static async searchAdvanced(
        text: string,
        options?: WikidataAdvancedSearchoptions
    ): Promise<
        {
            id: string
            relevance?: number
            label: string
            description?: string
        }[]
    > {
        // `options` is optional, so every field has to be read defensively
        const lang = options?.lang ?? "en"
        const maxCount = options?.maxCount ?? 20
        // Some padding for disambiguation pages
        const paddedLimit = Math.round(maxCount * 1.5)

        let instanceOf = ""
        if (options?.instanceOf !== undefined && options.instanceOf.length > 0) {
            const phrases = options.instanceOf.map(
                (q) => `{ ?item wdt:P31/wdt:P279* wd:Q${q}. }`
            )
            instanceOf = "{" + phrases.join(" UNION ") + "}"
        }
        const forbidden = (options?.notInstanceOf ?? []).concat([17379835]) // blacklist 'wikimedia pages outside of the main knowledge tree', e.g. disambiguation pages
        const minusPhrases = forbidden.map(
            (q) => ` MINUS {?item wdt:P31/wdt:P279* wd:Q${q} .}`
        )
        const sparql = `SELECT * WHERE {
            SERVICE wikibase:mwapi {
                bd:serviceParam wikibase:api "EntitySearch" .
                bd:serviceParam wikibase:endpoint "www.wikidata.org" .
                bd:serviceParam mwapi:search "${Wikidata.escapeSparqlString(text)}" .
                bd:serviceParam mwapi:language "${Wikidata.escapeSparqlString(lang)}" .
                ?item wikibase:apiOutputItem mwapi:item .
                ?num wikibase:apiOrdinal true .
                bd:serviceParam wikibase:limit ${paddedLimit} .
                ?label wikibase:apiOutput mwapi:label .
                ?description wikibase:apiOutput "@description" .
            }
            ${instanceOf}
            ${minusPhrases.join("\n      ")}
        } ORDER BY ASC(?num) LIMIT ${maxCount}`
        const url = wds.sparqlQuery(sparql)
        const result = await Utils.downloadJson(url)
        return result.results.bindings.map(({ item, label, description, num }) => ({
            // The binding contains the ordinal as text, whereas a number is promised
            relevance: num?.value === undefined ? undefined : Number(num.value),
            /*The full uri of the wikidata-item*/
            id: item?.value,
            label: label?.value,
            description: description?.value,
        }))
    }

    /**
     * Searches wikidata items with the `wbsearchentities` API.
     * At most 50 results are requested at once; more requests are made until `maxCount` results
     * are collected or the API returns fewer results than asked for.
     *
     * @param search the (raw, unencoded) text to search for
     * @param options optional; the language defaults to "en" and the maximum number of results to 20
     * @param page 1-based page to start from, where a page holds `min(maxCount, 50)` results
     */
    public static async search(
        search: string,
        options?: WikidataSearchoptions,
        page = 1
    ): Promise<
        {
            id: string
            label: string
            description: string
        }[]
    > {
        const maxPageSize = 50
        const maxCount = options?.maxCount ?? 20
        const lang = encodeURIComponent(options?.lang ?? "en")
        const encodedSearch = encodeURIComponent(search)

        // The offset is tracked explicitly: deriving it from a page number breaks
        // as soon as the last request asks for fewer results than the previous ones
        let offset = (page - 1) * Math.min(maxCount, maxPageSize)
        const results: any[] = []
        while (results.length < maxCount) {
            const limit = Math.min(maxCount - results.length, maxPageSize)
            const url =
                "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=" +
                encodedSearch +
                "&language=" +
                lang +
                "&limit=" +
                limit +
                "&continue=" +
                offset +
                "&format=json&uselang=" +
                lang +
                "&type=item&origin=*" +
                "&props=" // props= removes some unused values in the result
            const response = await Utils.downloadJsonCached(url, 10000)
            const batch: any[] = response.search
            results.push(...batch)
            if (batch.length < limit) {
                // No next page
                break
            }
            offset += batch.length
        }
        return results
    }

    /**
     * Runs `searchAdvanced` and loads the full entity of every result.
     * Results which could not be loaded are logged and left out.
     */
    public static async searchAndFetch(
        search: string,
        options?: WikidataAdvancedSearchoptions
    ): Promise<WikidataResponse[]> {
        const searchResults = await Wikidata.searchAdvanced(search, options)
        const maybeResponses = await Promise.all(
            searchResults.map(async (r) => {
                try {
                    console.log("Loading ", r.id)
                    return await Wikidata.LoadWikidataEntry(r.id).AsPromise()
                } catch (e) {
                    console.error(e)
                    return undefined
                }
            })
        )
        // A failed load yields `undefined` (see the catch above), hence the optional access
        return Utils.NoNull(maybeResponses.map((r) => <WikidataResponse>r?.["success"]))
    }

    /**
     * Gets the 'key' segment from a URL
     *
     * Wikidata.ExtractKey("https://www.wikidata.org/wiki/Lexeme:L614072") // => "L614072"
     * Wikidata.ExtractKey("http://www.wikidata.org/entity/Q55008046") // => "Q55008046"
     * Wikidata.ExtractKey("Q55008046") // => "Q55008046"
     * Wikidata.ExtractKey("55008046") // => "Q55008046"
     * Wikidata.ExtractKey(55008046) // => "Q55008046"
     * Wikidata.ExtractKey("A55008046") // => undefined
     * Wikidata.ExtractKey("Q55008046X") // => undefined
     * Wikidata.ExtractKey("Q1e5") // => undefined
     * Wikidata.ExtractKey("Q") // => undefined
     * Wikidata.ExtractKey("") // => undefined
     */
    public static ExtractKey(value: string | number): string {
        if (typeof value === "number") {
            return "Q" + value
        }
        if (value === undefined || value === null) {
            console.error("ExtractKey: value is undefined")
            return undefined
        }
        let candidate = value.trim().toLowerCase()
        for (const prefix of Wikidata._prefixesToRemove) {
            if (candidate.startsWith(prefix)) {
                candidate = candidate.substring(prefix.length)
            }
        }

        // Only plain decimal digits are accepted: `Number(...)` would also accept "1e5", "0x1f", " 12" or "-3".
        // Everything else (e.g. some random link) falls through and yields undefined
        const onlyDigits = /^\d+$/
        for (const identifierPrefix of Wikidata._identifierPrefixes) {
            if (candidate.startsWith(identifierPrefix)) {
                const digits = candidate.substring(identifierPrefix.length)
                return onlyDigits.test(digits) ? candidate.toUpperCase() : undefined
            }
        }
        if (onlyDigits.test(candidate)) {
            return "Q" + candidate
        }
        return undefined
    }

    /**
     * Converts 'Q123' into 123, returns undefined if invalid
     *
     * Wikidata.QIdToNumber("Q123") // => 123
     * Wikidata.QIdToNumber("  Q123  ") // => 123
     * Wikidata.QIdToNumber("  X123  ") // => undefined
     * Wikidata.QIdToNumber("  Q123X  ") // => undefined
     * Wikidata.QIdToNumber("Q") // => undefined
     * Wikidata.QIdToNumber(undefined) // => undefined
     * Wikidata.QIdToNumber(123) // => 123
     */
    public static QIdToNumber(q: string | number): number | undefined {
        if (q === undefined || q === null) {
            return undefined
        }
        if (typeof q === "number") {
            return q
        }
        // Requiring at least one digit avoids that a bare "Q" is converted into 0
        const match = /^Q(\d+)$/.exec(q.trim())
        if (match === null) {
            return undefined
        }
        return Number(match[1])
    }

    /**
     * Gives the URL of the wikidata page of the given item (`Q...`) or lexeme (`L...`).
     * Throws (a string) if the id starts with something else.
     */
    public static IdToArticle(id: string) {
        if (id.startsWith("Q")) {
            return "https://wikidata.org/wiki/" + id
        }
        if (id.startsWith("L")) {
            return "https://wikidata.org/wiki/Lexeme:" + id
        }
        throw "Unknown id type: " + id
    }

    /**
     * Build a SPARQL-query, return the result
     *
     * @param keys: how variables are named. Every key not ending with 'Label' should appear in at least one statement
     * @param statements the statements of the WHERE-clause; a closing '.' is added where missing
     * @returns the bindings of the query result
     */
    public static async Sparql<T>(
        keys: string[],
        statements: string[]
    ): Promise<(T & Record<string, { type: string; value: string }>)[]> {
        const query =
            "SELECT " +
            keys.map((k) => (k.startsWith("?") ? k : "?" + k)).join(" ") +
            "\n" +
            "WHERE\n" +
            "{\n" +
            statements.map((stmt) => (stmt.endsWith(".") ? stmt : stmt + ".")).join("\n") +
            ' SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }\n' +
            "}"
        const url = wds.sparqlQuery(query)
        const result = await Utils.downloadJsonCached(url, 24 * 60 * 60 * 1000)
        return result.results.bindings
    }

    /**
     * Loads a wikidata entry; the promise is cached per (stringified) value,
     * so the same value is only loaded once.
     */
    public static async LoadWikidataEntryAsync(value: string | number): Promise<WikidataResponse> {
        const key = "" + value
        const cached = Wikidata._cache.get(key)
        if (cached) {
            return cached
        }
        const uncached = Wikidata.LoadWikidataEntryUncachedAsync(value)
        Wikidata._cache.set(key, uncached)
        return uncached
    }

    /**
     * Escapes backslashes and double quotes, so that the text can be placed between double quotes in a SPARQL-query
     */
    private static escapeSparqlString(raw: string): string {
        return raw.replace(/\\/g, "\\\\").replace(/"/g, '\\"')
    }

    /**
     * Loads a wikidata page
     * @returns the entity of the given value; undefined if no identifier could be extracted from the value
     */
    private static async LoadWikidataEntryUncachedAsync(
        value: string | number
    ): Promise<WikidataResponse> {
        const id = Wikidata.ExtractKey(value)
        if (id === undefined) {
            console.warn("Could not extract a wikidata entry from", value)
            return undefined
        }

        const url = "https://www.wikidata.org/wiki/Special:EntityData/" + id + ".json"
        const entities = (await Utils.downloadJsonCached(url, 10000)).entities
        const firstKey = Object.keys(entities)[0] // Roundabout way to fetch the entity; it might have been a redirect
        const response = entities[firstKey]

        if (id.startsWith("L")) {
            // This is a lexeme:
            return new WikidataLexeme(response).asWikidataResponse()
        }

        return WikidataResponse.fromJson(response)
    }
}