From 6ec9abe3b9cd9dab38dabb1bc12014655b7328b2 Mon Sep 17 00:00:00 2001 From: Sven Hertling Date: Wed, 5 Jun 2019 12:21:05 +0200 Subject: [PATCH] added title of section to the extraction --- .../extraction/nif/GeneralNifExtractor.scala | 18 +++++++++++++++++- .../nif/NifExtractionAstVisitor.scala | 12 ++++++++---- .../dbpedia/extraction/nif/NifSection.scala | 2 ++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala index 982e605c52..260fdccce1 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/GeneralNifExtractor.scala @@ -36,6 +36,7 @@ class GeneralNifExtractor ( val wikiPageExternalLinkProperty = context.ontology.properties("wikiPageExternalLink") val wikiPageInterWikiLinkProperty = context.ontology.properties("wikiPageInterWikiLink") val wikiPageInterLanguageLinkProperty = context.ontology.properties("wikiPageInterLanguageLink") + val labelProperty = context.ontology.properties("rdfs:label") @@ -69,7 +70,7 @@ class GeneralNifExtractor ( private def writeLongAndShortAbstract(section: NifSection, text:String):ArrayBuffer[Quad] = { var quads = ArrayBuffer[Quad]() if (recordAbstracts && section.id == "abstract" && text.length > 0) { - val describingParagraphs = getParagraphsDescribingConcept(section, text) + val describingParagraphs = section.paragraphs//getParagraphsDescribingConcept(section, text) if(describingParagraphs.size > 0){ quads += longQuad(wikiPage.uri, text.substring(describingParagraphs.head.begin.getOrElse(0), describingParagraphs.last.end.getOrElse(0)), sourceUrl) //text.substring(section.begin.getOrElse(0), section.end.getOrElse(0)), sourceUrl) quads += shortQuad(wikiPage.uri, getShortAbstract(describingParagraphs, text), sourceUrl) // getShortAbstract(section.paragraphs, text), sourceUrl) @@ -184,6 +185,21 @@ class GeneralNifExtractor ( if (section.next.isEmpty) quads += nifStructure(topSectionUri, RdfNamespace.NIF.append("lastSection"), sectionUri, sourceUrl, null) + //adding title + if(section.beginTitle.nonEmpty && section.endTitle.nonEmpty){ + val titleUri = getNifIri("title", section.beginTitle.get, section.endTitle.get) + quads += nifStructure(titleUri, RdfNamespace.RDF.append("type"), RdfNamespace.NIF.append("Title"), sourceUrl, null) + quads += nifStructure(titleUri, RdfNamespace.NIF.append("referenceContext"), nifContextUri, sourceUrl, null) + quads += nifStructure(titleUri, RdfNamespace.NIF.append("beginIndex"), section.beginTitle.get.toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) + quads += nifStructure(titleUri, RdfNamespace.NIF.append("endIndex"), section.endTitle.get.toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) + quads += nifStructure(titleUri, RdfNamespace.NIF.append("superString"), sectionUri, sourceUrl, null) + if(writeLinkAnchors){ + quads += nifStructure(titleUri, RdfNamespace.NIF.append("anchorOf"), section.id, sourceUrl, RdfNamespace.XSD.append("string")) + quads += nifStructure(sectionUri, labelProperty.uri, section.id.trim, sourceUrl, RdfNamespace.XSD.append("string")) + } + + } + quads } diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala index dfc9a50863..7274de55a2 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/NifExtractionAstVisitor.scala @@ -17,7 +17,7 @@ class NifExtractionAstVisitor(language : Language) private var currentSection = new ListBuffer[Int]() private var context = StringBuilder.newBuilder //contains the whole text of the wikipage - private var nifSection: NifSection = new NifSection(id = "abstract",ref = "", prev = None, next = None, top = None, sub = None, begin = Some(0), end = None, paragraphs = ListBuffer()) + private var nifSection: NifSection = new NifSection(id = "abstract",ref = "", prev = None, next = None, top = None, sub = None, begin = Some(0), end = None, beginTitle = None, endTitle = None, paragraphs = ListBuffer()) private var nifParagraph: NifParagraph = new NifParagraph(begin = Some(0), end = None,links = ListBuffer()) private var extLinkNum :Int = 1 @@ -42,6 +42,8 @@ class NifExtractionAstVisitor(language : Language) sub = None, begin = Some(0), end = None, + beginTitle = None, + endTitle = None, paragraphs = ListBuffer() ) nifSection = abstractSection @@ -256,6 +258,8 @@ class NifExtractionAstVisitor(language : Language) sub = None, begin = None, end = None, + beginTitle = None, + endTitle = None, paragraphs = ListBuffer() ) section.top match{ @@ -282,10 +286,10 @@ class NifExtractionAstVisitor(language : Language) tocMap.append(nifSection) //closeParagraphAndStartNew() nifParagraph = new NifParagraph(begin = Some(0),end = Some(0),links = ListBuffer()) //dummy paragraph (all lnks in header will be put in dummy) - val startHeading = context.length + nifSection.beginTitle = Some(context.length) iterate(s.getHeading) - val endHeading = context.length - nifSection.id = context.substring(startHeading, endHeading) + nifSection.endTitle = Some(context.length) + nifSection.id = context.substring(nifSection.beginTitle.get, nifSection.endTitle.get) closeParagraphAndStartNew() } diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/NifSection.scala b/core/src/main/scala/org/dbpedia/extraction/nif/NifSection.scala index 31a5f76f82..92d4a851f0 100644 --- a/core/src/main/scala/org/dbpedia/extraction/nif/NifSection.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/NifSection.scala @@ -11,6 +11,8 @@ class NifSection( var sub: Option[NifSection], var begin: Option[Int], var end: Option[Int], + var beginTitle: Option[Int], + var endTitle: Option[Int], var paragraphs: ListBuffer[NifParagraph] ) { def addParagraph(nifparagraph: NifParagraph): Unit = paragraphs += nifparagraph