ordia
ordia copied to clipboard
Statistics over hyphenation parts
Statistics over hyphenation parts.
The difficulty seems to be constructing a SPARQL query that splits the hyphenation string into its individual parts. This attempt does not work:
# Attempt to split each hyphenation string at the "‧" separator.
# The surrounding notes report that this query does not work.
SELECT * {
# Forms of lexemes whose language is wd:Q9035; the P5279 value of each form
# is bound to ?hyphenation.
?lexeme dct:language wd:Q9035 .
?lexeme ontolex:lexicalForm ?form .
?form wdt:P5279 ?hyphenation .
# Intended: keep only the text before the first "‧".
# NOTE(review): this BIND sits in its own { } group; ?hyphenation is presumably
# not in scope there, because the group is evaluated on its own before being
# joined with the triple patterns above — confirm that this is why it fails.
{ BIND(REPLACE(?hyphenation, "^([^‧]+)‧.*$", "$1") AS ?hyphenation_parts) }
UNION
# Intended: keep only the text after the last "‧".
# This branch binds ?hyphenation_parts2, a different variable from the first branch.
{ BIND(REPLACE(?hyphenation, "^.+‧([^‧]+).*$", "$1") AS ?hyphenation_parts2) }
}
There is a similar query by Daniel Mietchen:
https://query.wikidata.org/#SELECT%20%28SAMPLE%28DISTINCT%20%3Fx%29%20AS%20%3Fitem%29%20%3Fw%20%28COUNT%28DISTINCT%20%3Fx%29%20AS%20%3Fc%29%20%28STRLEN%28%3Fw%29%20AS%20%3Fl%29%20WHERE%20%7B%0A%20%20%7B%0A%20%20%20%20SELECT%20DISTINCT%20%3Fx%20%3Ftitle%20WHERE%20%7B%0A%20%20%20%20%20%20%3Fx%20schema%3AdateModified%20%3Fdate_modified%20%3B%0A%20%20%20%20%20%20%20%20%20wdt%3AP31%20wd%3AQ13442814%20%3B%0A%20%20%20%20%20%20%20%20%20wdt%3AP1476%20%3Ftitle.%0A%20%20%20%20%20%20BIND%20%28now%28%29%20-%20%3Fdate_modified%20as%20%3Fdate_range%29%0A%20%20%20%20%20%20FILTER%20%28%3Fdate_range%20%3C%2040%29%0A%20%20%20%20%20%20FILTER%28STRLEN%28%3Ftitle%29%20%3E%3D%2010%29%0A%20%20%20%20%7D%0A%20%20%20%20LIMIT%2010000%0A%20%20%7D%0A%20%20FILTER%20NOT%20EXISTS%20%7B%3Fx%20wdt%3AP921%20%3Ftopic%7D%0A%20%20BIND%28LCASE%28%3Ftitle%29%20AS%20%3Fltitle%29%0A%20%20BIND%28REPLACE%28%3Fltitle%2C%20%22%5E.%2A%3F%28%5C%5Cb%5C%5Cw%7B13%2C%7D%5C%5Cb%29.%2A%24%22%2C%20%22%241%22%29%20AS%20%3Fw1%29%0A%20%20BIND%28REPLACE%28STRAFTER%28%3Fltitle%2C%20%3Fw1%29%2C%20%22%5E.%2A%3F%28%5C%5Cb%5C%5Cw%7B13%2C%7D%5C%5Cb%29.%2A%24%22%2C%20%22%241%22%29%20AS%20%3Fw2%29%0A%20%20BIND%28REPLACE%28STRAFTER%28%3Fltitle%2C%20%3Fw2%29%2C%20%22%5E.%2A%3F%28%5C%5Cb%5C%5Cw%7B13%2C%7D%5C%5Cb%29.%2A%24%22%2C%20%22%241%22%29%20AS%20%3Fw3%29%0A%20%20VALUES%20%3Fw_%20%7B%201%202%203%20%7D%0A%20%20BIND%28IF%28%3Fw_%20%3D%201%2C%20%3Fw1%2C%20IF%28%3Fw_%20%3D%202%2C%20%3Fw2%2C%20%3Fw3%29%29%20AS%20%3Fw%29%0A%20%20FILTER%28REGEX%28%3Fw%2C%20%22%5E%5C%5Cw%2B%24%22%29%29%20%23%20since%20%3Fw%20may%20evaluate%20to%20an%20empty%20string%2C%20e.g.%20for%20one-word%20titles%0A%7D%0AGROUP%20BY%20%3Fitem%20%3Fw%0AORDER%20BY%20DESC%28%3Fc%29%20DESC%28%3Fl%29%0ALIMIT%202000
The following query works, but only includes the first and the last part of each hyphenation:
# Gather the hyphenation strings once in a named subquery, then emit one row
# per extracted part: the leading part and the trailing part of each string.
# Middle parts of strings with three or more parts are not extracted.
SELECT ?hyphenation ?hyphenation_parts
WITH {
  # P5279 values found on the forms of lexemes whose language is wd:Q9035.
  SELECT ?hyphenation {
    ?lex dct:language wd:Q9035 ;
         ontolex:lexicalForm ?lexform .
    ?lexform wdt:P5279 ?hyphenation .
  }
} AS %hyphenations
WHERE {
  {
    # Leading part: the text before the first "‧".
    # A string without "‧" does not match, so REPLACE returns it unchanged.
    SELECT ?hyphenation ?hyphenation_parts {
      INCLUDE %hyphenations
      BIND(REPLACE(?hyphenation, "^([^‧]+?)‧.*$", "$1") AS ?hyphenation_parts)
    }
  }
  UNION
  {
    # Trailing part: the text after the last "‧".
    # A string without "‧" does not match here either and is returned unchanged.
    SELECT ?hyphenation ?hyphenation_parts {
      INCLUDE %hyphenations
      BIND(REPLACE(?hyphenation, "^.*‧([^‧]+?)$", "$1") AS ?hyphenation_parts)
    }
  }
}