Skip to content

Commit

Permalink
Better extraction of inter-language links
Browse files Browse the repository at this point in the history
  • Loading branch information
sven-h committed Apr 3, 2019
1 parent 0db26a7 commit 48d1845
Show file tree
Hide file tree
Showing 12 changed files with 319 additions and 38 deletions.
13 changes: 12 additions & 1 deletion core/src/main/resources/datasetdefinitions.json
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,12 @@
"desc": "Dataset linking a DBpedia resource to the same resource in other languages and in Wikidata.",
"defaultgraph": "dataset"
},
"interwiki_links": {
"name": "InterWiki Links",
"traits":"LinkedData, Published",
"desc": "Dataset linking a DBpedia resource to the same resource in other wikis.",
"defaultgraph": "dataset"
},
"interlanguage_links_chapters": {
"name": "Interlanguage Links between DBpedia Chapters",
"traits":"LinkedData, Published",
Expand Down Expand Up @@ -268,8 +274,13 @@
"traits":"LinkedData, Published",
"desc": "Definition of predicates extracted from xml tags",
"defaultgraph": "namespace"
},
"interwiki_sameas": {
"name": "InterWiki SameAs Links",
"traits":"LinkedData, Published",
"desc": "Links between wiki articles that share the same meaning",
"defaultgraph": "namespace"
}

},
"mappings_based":{
"instance_types": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ object DBpediaDatasets extends java.io.Serializable
val RevisionMeta: Dataset = datasets("revision_meta")
val PageIds: Dataset = datasets("page_ids")
val InterLanguageLinks: Dataset = datasets("interlanguage_links") // Since the inter-language links were moved from Wikipedia to Wikidata, we now extract these links from the Wikidata dump, not from Wikipedia pages.")
val InterWikiLinks: Dataset = datasets("interwiki_links")
val InterLanguageLinksChapter: Dataset = datasets("interlanguage_links_chapters")
val Genders: Dataset = datasets("genders")
val TopicalConcepts: Dataset = datasets("topical_concepts")
Expand All @@ -177,6 +178,7 @@ object DBpediaDatasets extends java.io.Serializable
val RelationExtraction: Dataset = datasets("relation_extraction")
val TagProperties: Dataset = datasets("tag_properties")
val TagPropertiesDefinitions: Dataset = datasets("tag_property_definitions")
val InterWikiSameAs: Dataset = datasets("interwiki_sameas")

/**
* Mapping based
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ class InterLanguageLinksExtractor(context: { def ontology : Ontology; def langua
{
private val sameAsProperty = context.ontology.properties("owl:sameAs")

override val datasets = Set(DBpediaDatasets.InterLanguageLinks)
override val datasets = Set(DBpediaDatasets.InterLanguageLinks,DBpediaDatasets.InterWikiLinks)

private val namespaces = if (context.language == Language.Commons) ExtractorUtils.commonsNamespacesContainingMetadata
else Set(Namespace.Main, Namespace.Template, Namespace.Category)

private val quad = QuadBuilder.apply(context.language, DBpediaDatasets.InterLanguageLinks, sameAsProperty, null) _
private val quadInterLang = QuadBuilder.apply(context.language, DBpediaDatasets.InterLanguageLinks, sameAsProperty, null) _
val wikiPageWikiLinkProperty = context.ontology.properties("wikiPageInterWikiLink")
private val quadInterWiki = QuadBuilder.apply(context.language, DBpediaDatasets.InterWikiLinks, wikiPageWikiLinkProperty, null) _

override def extract(page : PageNode, subjectUri : String) : Seq[Quad] =
{
Expand All @@ -34,8 +36,9 @@ class InterLanguageLinksExtractor(context: { def ontology : Ontology; def langua
case link: InterWikiLinkNode => {
val dst = link.destination
if (dst.isInterLanguageLink) {
val dstLang = dst.language
quads += quad(subjectUri, dstLang.resourceUri.append(dst.decodedWithNamespace), link.sourceIri)
quads += quadInterLang(subjectUri, dst.language.resourceUri.append(dst.decodedWithNamespace), link.sourceIri)
}else{
quads += quadInterWiki(subjectUri, dst.language.resourceUri.append(dst.decodedWithNamespace), link.sourceIri)
}
}
case _ => // ignore
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.ontology.{DBpediaNamespace, Ontology}
import org.dbpedia.extraction.transform.{Quad, QuadBuilder}
import org.dbpedia.extraction.util.Language
import org.dbpedia.extraction.wikiparser._

import scala.collection.mutable.{ArrayBuffer, HashSet}
import scala.language.reflectiveCalls

/**
* This extractor extracts all templates that exist in an article.
* This data can be used for Wikipedia administrative tasks.
*/
/**
 * Extracts owl:sameAs links from wiki articles (mainly Fandom/Wikia wikis) to the
 * corresponding article in Wikipedia / another language edition, based on well-known
 * cross-wiki templates such as {{Wikipedia}}, {{WP}}, {{WP-Song}} or {{Interlang}}.
 *
 * Emits one quad per detected link into the InterWikiSameAs dataset.
 */
class InterWikiSameAsExtraction(
  context: {
    def ontology: Ontology
    def language: Language
  }
) extends PageNodeExtractor {

  private val sameAsProperty = context.ontology.properties("owl:sameAs")

  override val datasets = Set(DBpediaDatasets.InterWikiSameAs)

  // Partially applied builder: only (subject, value, sourceIri) vary per emitted quad.
  private val quad = QuadBuilder.apply(context.language, DBpediaDatasets.InterWikiSameAs, sameAsProperty, null) _

  // Target namespace for English-Wikipedia-backed resources.
  private val dbpediaNamespace = new DBpediaNamespace("http://dbpedia.org/resource/")

  // Template names (lower-cased, URL-encoded form) that indicate the page mirrors a
  // Wikipedia article. Observed on large wikis (https://community.fandom.com/wiki/Hub:Big_wikis):
  //   {{Wikipedia}}        http://military.wikia.com/wiki/Template:Wikipedia
  //   {{Wikipedia-title}}  https://memory-alpha.fandom.com/wiki/Template:Wikipedia-title
  //   {{Wikilink}}         https://harrypotter.fandom.com/wiki/Template:Wikilink
  //   {{WP}}               https://disney.fandom.com/wiki/Template:WP
  //   {{EnWP}}             http://familypedia.wikia.com/wiki/Template:EnWP
  //   {{Usedwp}}           http://familypedia.wikia.com/wiki/Template:Usedwp
  //   {{person-enWP}}      http://familypedia.wikia.com/wiki/Template:person-enWP
  private val wikipediaTemplateNames =
    Set("wikipedia", "wikipedia-title", "wikilink", "wp", "enwp", "usedwp", "person-enwp")

  // To list all templates of a wiki:
  //   index.php?title=Special%3AAllPages&namespace=10
  //   /api.php?action=query&format=json&list=allpages&apnamespace=10&aplimit=max
  // To see where a template is used:
  //   /index.php?title=Special%3AWhatLinksHere&target=Template%3AWikipedia-title&namespace=0
  // Further surveyed wikis: Wookieepedia ({{Interlang}}, https://starwars.fandom.com/wiki/Template:Interlang),
  // LyricWiki ({{WP-Song}}, {{SongHeader}}), Memory Alpha/Beta, The Ice Hockey Wiki,
  // American Football Database; Marvel, EverQuest 2, Yu-Gi-Oh! and LEGO wikis carry no usable templates.

  /**
   * Scans the templates of a page for cross-wiki markers and returns the resulting
   * owl:sameAs quads.
   *
   * @param node       the parsed wiki page
   * @param subjectUri the resource URI of the page in this wiki
   * @return sameAs quads to Wikipedia and/or other language editions; empty for
   *         non-Main-namespace, redirect and disambiguation pages
   */
  override def extract(node: PageNode, subjectUri: String): Seq[Quad] = {
    // Only extract for pages from the Main namespace.
    if (node.title.namespace != Namespace.Main) return Seq.empty
    // Don't extract from redirect and disambiguation pages.
    if (node.isRedirect || node.isDisambiguation) return Seq.empty

    val quads = new ArrayBuffer[Quad]()

    // Loop state: whether a Wikipedia-marker template was seen, and the Wikipedia
    // article title it points to (later templates overwrite earlier ones).
    var isWikipedia = false
    var wikipediaTitle = ""

    val templateNodes = collectTemplatesTopLevel(node)
    for (template <- templateNodes) {
      val name = template.title.encoded.toLowerCase

      if (wikipediaTemplateNames.contains(name)) {
        // Unnamed first parameter holds the Wikipedia title; fall back to this page's title.
        isWikipedia = true
        wikipediaTitle = getNormalizedOptionalNodeValueOrDefault(template.property("1"), node.title.decoded)
      } else if (name == "wp-song") {
        // LyricWiki: http://lyrics.wikia.com/wiki/Template:WP-Song
        isWikipedia = true
        wikipediaTitle = getNormalizedOptionalNodeValueOrDefault(template.property("1"), "")
      } else if (name == "songheader" && wikipediaTitle.isEmpty) {
        // LyricWiki fallback: only fills the title if nothing better was found yet.
        wikipediaTitle = getNormalizedOptionalNodeValueOrDefault(template.property("song"), wikipediaTitle)
      } else if (name == "interlang") {
        // Wookieepedia-style inter-language template: each parameter key is a language
        // code, its value the article title in that language edition.
        for (lang <- template.keySet) {
          val title = getNormalizedOptionalNodeValueOrDefault(template.property(lang), "")
          if (title.nonEmpty) {
            Language.get(lang) match {
              case Some(x) => quads += quad(subjectUri, x.resourceUri.append(title), node.sourceIri)
              case None => // unknown language code — skip
            }
          }
        }
      }
    }
    if (isWikipedia && wikipediaTitle.nonEmpty) {
      quads += quad(subjectUri, dbpediaNamespace.append(wikipediaTitle), node.sourceIri)
    }
    quads
  }

  /** Returns the top-most template nodes of the page (does not descend into templates). */
  private def collectTemplatesTopLevel(node: Node): List[TemplateNode] = {
    node match {
      case templateNode: TemplateNode => List(templateNode)
      case _ => node.children.flatMap(collectTemplatesTopLevel)
    }
  }

  /** Normalized plain-text value of an optional template property, or the default if absent/empty. */
  private def getNormalizedOptionalNodeValueOrDefault(n: Option[PropertyNode], defaultValue: String): String =
    n.fold(defaultValue)(getNormalizeNodeValueOrDefault(_, defaultValue))

  /** Normalized plain-text value of a template property, or the default if it normalizes to empty. */
  private def getNormalizeNodeValueOrDefault(n: PropertyNode, defaultValue: String): String = {
    val normalizedNodeValue = normalizeString(n.propertyNodeValueToPlainText)
    if (normalizedNodeValue.isEmpty) defaultValue else normalizedNodeValue
  }

  /** Strips line breaks and surrounding whitespace from a template parameter value. */
  private def normalizeString(txt: String): String =
    txt.replace("\n", "").replace("\r", "").trim
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ package org.dbpedia.extraction.mappings

import de.fau.cs.osr.ptk.common.AstPrinter
import java.io.PrintWriter

import org.dbpedia.extraction.annotations.ExtractorAnnotation
import org.dbpedia.extraction.config.Config
import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.nif.{TextConvert, WikipediaNifExtractor}
import org.dbpedia.extraction.ontology.{Ontology, OntologyProperty}
import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.transform.{Quad, QuadBuilder}
import org.dbpedia.extraction.util.{Language, WikiSettings}
import org.dbpedia.extraction.wikiparser._
import org.sweble.wikitext.engine._
Expand All @@ -21,6 +22,8 @@ import org.sweble.wikitext.parser.nodes.{WtNode, WtUrl}
import org.apache.commons.lang3.StringEscapeUtils
import org.sweble.wikitext.engine.nodes.{EngPage, EngProcessedPage}

import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.language.reflectiveCalls

Expand All @@ -44,7 +47,7 @@ class NifSwebleExtractor(
)
extends WikiPageExtractor
{
override val datasets = Set(DBpediaDatasets.NifContext,DBpediaDatasets.NifPageStructure,DBpediaDatasets.NifTextLinks,DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts, DBpediaDatasets.RawTables, DBpediaDatasets.Equations)
override val datasets = Set(DBpediaDatasets.NifContext,DBpediaDatasets.NifPageStructure,DBpediaDatasets.NifTextLinks,DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts, DBpediaDatasets.RawTables, DBpediaDatasets.Equations, DBpediaDatasets.InterWikiLinks)

var config: WikiConfig = getSwebleConfig()
var engine = new WtEngineImpl(config)
Expand All @@ -55,6 +58,9 @@ class NifSwebleExtractor(
protected lazy val shortProperty: OntologyProperty = context.ontology.properties(context.configFile.abstractParameters.shortAbstractsProperty)
protected lazy val longProperty: OntologyProperty = context.ontology.properties(context.configFile.abstractParameters.longAbstractsProperty)

val wikiPageWikiLinkProperty = context.ontology.properties("wikiPageInterWikiLink")
private val interWikiQuad = QuadBuilder.apply(context.language, DBpediaDatasets.InterWikiLinks, wikiPageWikiLinkProperty, null) _

def getSwebleConfig(): WikiConfig = {
//https://github.com/sweble/sweble-wikitext/blob/develop/sweble-wikitext-components-parent/swc-engine/src/main/java/org/sweble/wikitext/engine/utils/LanguageConfigGenerator.java

Expand Down Expand Up @@ -102,9 +108,23 @@ class NifSwebleExtractor(

var quads = new ArrayBuffer[Quad]()


scala.util.control.Exception.ignoring(classOf[Exception]) {
val page = engine.postprocess(pageId, source,new MyExpansionCallback(context.templates)).getPage

var linkExtract = new SwebleLinkExtractor()
linkExtract.go(page)
for (link <- linkExtract.internalLinks){
try
{
val destinationTitle = WikiTitle.parse(link, context.language)
if(destinationTitle.language != context.language && destinationTitle.namespace.code == 0) {
quads += interWikiQuad(subjectUri, destinationTitle.resourceIri, pageNode.sourceIri)
//println(destinationTitle)
}
}
catch { case _: Throwable => }
}

//new PrintWriter(pageNode.title.decoded + "_ast_expansion") { write(AstPrinter.print[WtNode](page)); close }
var html = HtmlRenderer.print(new MyRendererCallback, config, pageTitle, page)
html = StringEscapeUtils.unescapeXml(html)
Expand All @@ -131,6 +151,34 @@ class NifSwebleExtractor(
}
}

import de.fau.cs.osr.ptk.common.AstVisitor
import org.sweble.wikitext.parser.nodes.WtNode
import org.sweble.wikitext.parser.nodes.WtExternalLink
import org.sweble.wikitext.parser.nodes.WtInternalLink

/**
 * Sweble AST visitor that collects the raw target strings of all internal
 * wiki links ([[...]]) in a parsed page. Run it with go(page), then read
 * the accumulated targets from internalLinks.
 */
class SwebleLinkExtractor() extends AstVisitor[WtNode] {

  // Link targets in document order, filled during traversal.
  // NOTE(review): mutable.MutableList is deprecated since Scala 2.13 —
  // consider ListBuffer if the field type can be changed for all callers.
  val internalLinks = mutable.MutableList[String]()

  /** Fallback for all node types without a dedicated overload: recurse into children. */
  def visit(n: WtNode): Unit = {
    iterate(n)
  }

  /** Records the raw (unresolved) target text of every internal link node. */
  def visit(link: WtInternalLink): Unit = {
    internalLinks += link.getTarget.getAsString
  }
}

final private class MyExpansionCallback(templates : Template) extends ExpansionCallback {
override def retrieveWikitext(expansionFrame: ExpansionFrame, pageTitle: PageTitle): FullPage =templates.getFullPage(pageTitle.getTitle)
override def fileUrl(pageTitle: PageTitle, width: Int, height: Int): String = ""
Expand Down
Loading

0 comments on commit 48d1845

Please sign in to comment.