Skip to content

Commit

Permalink
better inter wiki link extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
sven-h committed Apr 8, 2019
1 parent 48d1845 commit 3de99cc
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 29 deletions.
35 changes: 17 additions & 18 deletions core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.dbpedia.extraction.nif;

import org.apache.commons.lang3.StringEscapeUtils;
import org.dbpedia.extraction.util.Language;
import org.dbpedia.extraction.wikiparser.WikiTitle;
import org.dbpedia.iri.UriUtils;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
Expand All @@ -21,10 +23,12 @@ public class LinkExtractor implements NodeVisitor {
private boolean invisible = false;
private NifExtractorContext context;
private ArrayList<String> errors = new ArrayList<>();
private Language fullLang;

public LinkExtractor(NifExtractorContext context) {
/**
 * Creates a link extractor with an empty paragraph list.
 *
 * @param context  extraction context stored on this visitor
 * @param fullLang language handed to WikiTitle.parse when internal links are cleaned
 */
public LinkExtractor(NifExtractorContext context, Language fullLang) {
    this.context = context;
    this.fullLang = fullLang;
    this.paragraphs = new ArrayList<>();
}

/**
Expand Down Expand Up @@ -77,24 +81,10 @@ public void head(Node node, int depth) {
} else if(node.nodeName().equals("a")) {
String link = node.attr("href");
//remove internal links linking to mediawiki meta pages. Also removes links that contain ":".
if (link.contains("mediawiki") && !link.contains(":")) {
if (link.contains("mediawiki")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);
} else if (link.contains("mediawiki") && link.contains(":")) {

if (!node.childNodes().isEmpty()) {
if (node.childNode(0).nodeName().equals("#text") &&
node.childNode(0).toString().contains(":") &&
!node.childNode(0).toString().contains("http")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);
}
} else {
skipLevel = depth;
}

} else if (node.attr("class").equals("external text")) {
//don't skip external links
tempLink = new Link();
Expand Down Expand Up @@ -145,8 +135,17 @@ private void setUri(String uri) {
private String cleanLink(String uri, boolean external) {
if(!external) {

uri = this.context.resource.substring(0, this.context.resource.indexOf("/resource/") + 10) + uri.substring(uri.indexOf("mediawiki/")+10);
uri = uri.replace("&action=edit&redlink=1", "");
uri = uri.replace("&action=edit&redlink=1", "");
WikiTitle destinationTitle = WikiTitle.parse(uri.substring(uri.indexOf("mediawiki/")+10), this.fullLang);

if(destinationTitle.fragment() == null){
return destinationTitle.resourceIri();
}else{
return destinationTitle.resourceIri() + "#" + destinationTitle.fragment();
}

//uri = this.context.resource.substring(0, this.context.resource.indexOf("/resource/") + 10) + uri.substring(uri.indexOf("mediawiki/")+10);
//uri = uri.replace("&action=edit&redlink=1", "");

} else {
//there are links that contain illegal hostnames
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.language.reflectiveCalls
import scala.util.matching.Regex

/**
* Extracts page html.
Expand Down Expand Up @@ -53,6 +54,8 @@ class NifSwebleExtractor(
var engine = new WtEngineImpl(config)
//var textConverter =

protected val removeThumbLinks: Regex = "(\\[\\[.*)\\|thumb([\\|]?.*\\]\\])".r

protected val shortAbstractLength: Int = context.configFile.abstractParameters.shortAbstractMinLength

protected lazy val shortProperty: OntologyProperty = context.ontology.properties(context.configFile.abstractParameters.shortAbstractsProperty)
Expand Down Expand Up @@ -104,7 +107,8 @@ class NifSwebleExtractor(
val pageTitle = PageTitle.make(config, pageNode.title.decodedWithNamespace)
val pageId = new PageId(pageTitle, pageNode.id)

val source = StringEscapeUtils.unescapeXml(pageNode.source)
var source = StringEscapeUtils.unescapeXml(pageNode.source)
source = removeThumbLinks.replaceAllIn(source, "")//"$1$2") //replace all links with thumb

var quads = new ArrayBuffer[Quad]()

Expand All @@ -117,8 +121,13 @@ class NifSwebleExtractor(
try
{
val destinationTitle = WikiTitle.parse(link, context.language)
if(destinationTitle.language != context.language && destinationTitle.namespace.code == 0) {
quads += interWikiQuad(subjectUri, destinationTitle.resourceIri, pageNode.sourceIri)
if(destinationTitle.language != context.language && destinationTitle.namespace.code == 0 ) {
if(destinationTitle.fragment == null){
quads += interWikiQuad(subjectUri, destinationTitle.resourceIri, pageNode.sourceIri)
}else{
quads += interWikiQuad(subjectUri, destinationTitle.resourceIri + "#" + destinationTitle.fragment, pageNode.sourceIri)
}

//println(destinationTitle)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import org.dbpedia.extraction.nif.Paragraph.HtmlString
import org.dbpedia.extraction.ontology.RdfNamespace
import org.dbpedia.extraction.transform.{Quad, QuadBuilder}
import org.dbpedia.extraction.config.Config.NifParameters
import org.dbpedia.extraction.util.{CssConfigurationMap, RecordSeverity}
import org.dbpedia.extraction.util.{CssConfigurationMap, Language, RecordSeverity}
import org.dbpedia.iri.UriUtils
import org.jsoup.Jsoup
import org.jsoup.nodes.{Document, Element, TextNode}
Expand All @@ -22,7 +22,7 @@ import scala.util.{Failure, Success, Try}
/**
* Created by Chile on 1/19/2017.
*/
abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifParameters : NifParameters) {
abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifParameters : NifParameters, fullLang: Language) {

assert(nifContextIri.contains("?"), "the nifContextIri needs a query part!")

Expand Down Expand Up @@ -299,7 +299,7 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
val element = new Element(Tag.valueOf("div"), "")
pageSection.content.foreach(element.appendChild)

val extractor: LinkExtractor = new LinkExtractor(extractionContext)
val extractor: LinkExtractor = new LinkExtractor(extractionContext, this.fullLang)
val traversor: NodeTraversor = new NodeTraversor(extractor)
traversor.traverse(element)
if (extractor.getParagraphs.size() > 0){
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ class WikipediaNifExtractor(
) extends HtmlNifExtractor(
wikiPage.uri + "?dbpv=" + context.configFile.dbPediaVersion + "&nif=context",
context.language.isoCode,
context.configFile.nifParameters
context.configFile.nifParameters,
context.language
) {

/**
Expand Down
39 changes: 35 additions & 4 deletions core/src/main/scala/org/dbpedia/extraction/util/Language.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import java.util.{Locale, MissingResourceException}

import org.dbpedia.extraction.ontology.{DBpediaNamespace, RdfNamespace}

import scala.collection.immutable.HashSet
import scala.collection.mutable.Map
import scala.collection.mutable.HashMap

Expand Down Expand Up @@ -70,7 +71,7 @@ object Language extends (String => Language)
val wikipediaLanguageUrl = "https://noc.wikimedia.org/conf/langlist"


val wikiLanguageCodes = List("aa","ab","ace","ady","af","ak","als","am","an","ang","ar","arc","arz","as","ast","atj","av","ay","az","azb",
val wikiLanguageCodes = HashSet("aa","ab","ace","ady","af","ak","als","am","an","ang","ar","arc","arz","as","ast","atj","av","ay","az","azb",
"ba","bar","bat-smg","bcl","be","be-tarask","bg","bh","bi","bjn","bm","bn","bo","bpy","br","bs","bug","bxr",
"ca","cbk-zam","cdo","ce","ceb","ch","cho","chr","chy","ckb","co","cr","crh","cs","csb","cu","cv","cy",
"da","de","din","diq","dsb","dty","dv","dz","ee","el","eml","en","eo","es","et","eu","ext",
Expand Down Expand Up @@ -252,16 +253,46 @@ object Language extends (String => Language)
* Gets a language object for a Wikipedia language code.
* Throws IllegalArgumentException if language code is unknown.
*/
def apply(code: String) : Language = map.getOrElse(code, throw new IllegalArgumentException("unknown language code "+code))
// An interwiki code (see checkCodeForInterWikiLink) wins over the static language map;
// a code found in neither place raises IllegalArgumentException.
def apply(code: String) : Language =
  checkCodeForInterWikiLink(code).getOrElse(
    map.getOrElse(code, throw new IllegalArgumentException("unknown language code "+code))
  )

/**
* Gets a language object for a Wikipedia language code, or None if given code is unknown.
*/
def get(code: String) : Option[Language] = map.get(code)
// Try the interwiki resolution first; the static language map is consulted only
// when that yields nothing (orElse evaluates its argument lazily).
def get(code: String) : Option[Language] =
  checkCodeForInterWikiLink(code).orElse(map.get(code))


/**
* Gets a language object for a Wikipedia language code, or the default if the given code is unknown.
*/
def getOrElse(code: String, default: => Language) : Language = map.getOrElse(code, default)
// Resolution order: interwiki code, then the static language map, then `default`.
// `default` is by-name and is evaluated only when both lookups miss.
def getOrElse(code: String, default: => Language) : Language =
  checkCodeForInterWikiLink(code).getOrElse(map.getOrElse(code, default))




/**
 * Resolves codes of the form "w:c:<name>" to a language object built by makeDbkwikLanguage.
 *
 * The text after the "w:c:" prefix is split on dots. When there is more than one segment and
 * the first one is listed in wikiLanguageCodes, that segment is used as the language code;
 * otherwise "en" is used. The text after the prefix, suffixed with ".wikia.com", is passed
 * on as the second argument.
 *
 * @param code the language code to inspect
 * @return Some(language) when code starts with "w:c:", otherwise scala.None
 */
def checkCodeForInterWikiLink(code: String) : Option[Language] = {
  val interWikiPrefix = "w:c:"
  if (!code.startsWith(interWikiPrefix)) {
    scala.None
  } else {
    val wikiName = code.substring(interWikiPrefix.length)
    val segments = wikiName.split("\\.")
    // A lone segment is never read as a language code, even if it happens to match one.
    val isoCode =
      if (segments.length > 1 && wikiLanguageCodes.contains(segments(0))) segments(0)
      else "en"
    Some(makeDbkwikLanguage(isoCode, wikiName + ".wikia.com", false, ""))
  }
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,14 @@ object WikiTitle
// FIXME: use interwiki prefixes from WikiSettingsDownloader.scala, e.g. [[q:Foo]] links to wikiquotes

var parts = decoded.split(":", -1)
if(decoded.startsWith("w:c:")){ //see https://community.fandom.com/wiki/Help:Interwiki_link
parts = decoded.substring(4).split(":", -1)
if (parts.length > 0){
parts(0) = "w:c:" + parts(0)
}
}



var leadingColon = false
var isInterLanguageLink = false
Expand Down

0 comments on commit 3de99cc

Please sign in to comment.