Skip to content

Commit

Permalink
added hearst pattern extraction
Browse files Browse the repository at this point in the history
updated template class extraction
updated abstract extraction
  • Loading branch information
sven-h committed Nov 12, 2019
1 parent c1a8857 commit 2f3899e
Show file tree
Hide file tree
Showing 13 changed files with 1,079 additions and 31 deletions.
13 changes: 13 additions & 0 deletions core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,19 @@
<version>2.1.0</version>
<type>pom</type>
</dependency-->

<!--extracting hearst patterns -->
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.9.2</version>
</dependency>
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.9.2</version>
<classifier>models-english</classifier>
</dependency>

<dependency>
<groupId>org.wikidata.wdtk</groupId>
Expand Down
111 changes: 111 additions & 0 deletions core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package org.dbpedia.extraction.hearst;

import java.util.regex.Pattern;

/**
 * A single Hearst-style lexico-syntactic pattern ("X such as Y", "Y is a X", ...)
 * together with its compiled regex and metadata describing how matches are
 * interpreted (e.g. whether the instance appears before the class).
 */
public class CustomPattern {

    private String pid;                  // pattern identifier
    private String regex;                // core regex text, without prefix/suffix decoration
    private String type;                 // "compact", "split", "split_noPrefix" or "split_noSuffix"
    private Pattern pattern;             // compiled pattern; null for unrecognized types
    private String preCondition;         // optional keyword that must occur in the sentence
    private Boolean excludePronouns;     // NOTE(review): never assigned or read in this class
    private String firstKeyWord;
    private String secondKeyWord;
    private Boolean instanceFirst;       // true when the instance NP precedes the class NP in a match

    // Quote-like characters (straight/curly single and double quotes) that may
    // optionally surround the matched phrase.
    private String surrounderSymbols = "[\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u0022]?";
    // Characters allowed directly after the leading word: quotes plus &, (c), (r).
    private String endSymbols = "[\"\\u0026\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u00A9\\u00AE]?"; //includes surrounderSymbols as well!
    // A match must be anchored by a letter/digit on each side (plus optional symbols).
    private String prefix = "(\\p{L}|\\d)" + endSymbols;
    private String suffix = surrounderSymbols + "(\\p{L}|\\d)";

    /** Pattern without pre-condition or keywords. */
    public CustomPattern(String pid, String regex, String type, Boolean instanceFirst) {
        this(pid, regex, type, null, null, null, instanceFirst);
    }

    /** Pattern with a pre-condition keyword but no first/second keywords. */
    public CustomPattern(String pid, String regex, String type, String preCond, Boolean instanceFirst) {
        this(pid, regex, type, preCond, null, null, instanceFirst);
    }

    /**
     * Full constructor; the other constructors delegate here with null defaults.
     *
     * @param pid           pattern identifier
     * @param regex         core regex text
     * @param type          one of "compact", "split", "split_noPrefix", "split_noSuffix";
     *                      any other value leaves {@code pattern} null (as in the
     *                      original behavior)
     * @param preCond       optional pre-condition keyword (may be null)
     * @param fkw           optional first keyword (may be null)
     * @param skw           optional second keyword (may be null)
     * @param instanceFirst whether the instance NP precedes the class NP
     */
    public CustomPattern(String pid, String regex, String type, String preCond, String fkw, String skw, Boolean instanceFirst) {
        this.pid = pid;
        this.regex = regex;
        this.type = type;
        this.firstKeyWord = fkw;
        this.secondKeyWord = skw;
        this.instanceFirst = instanceFirst;
        this.preCondition = preCond;
        this.pattern = compilePattern(regex, type);
    }

    /**
     * Decorates the core regex with prefix/suffix anchors according to the
     * pattern type and compiles it. Returns null for unrecognized types.
     * (?> ... ) is an independent (atomic) group: once matched it is not
     * backtracked into.
     */
    private Pattern compilePattern(String regex, String type) {
        if (type.equals("compact") || type.equals("split")) {
            return Pattern.compile(prefix + regex + suffix);
        }
        if (type.equals("split_noPrefix")) {
            return Pattern.compile("(?>" + regex + suffix + ")");
        }
        if (type.equals("split_noSuffix")) {
            return Pattern.compile("(?>" + prefix + regex + ")");
        }
        return null;
    }

    public Pattern getPattern() {
        return pattern;
    }

    public String getType() {
        return type;
    }

    public String getFirstKeyWord() {
        return firstKeyWord;
    }

    public String getSecondKeyWord() {
        return secondKeyWord;
    }

    public Boolean getInstanceFirst() {
        return instanceFirst;
    }
}

Large diffs are not rendered by default.

71 changes: 71 additions & 0 deletions core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package org.dbpedia.extraction.hearst;

import edu.stanford.nlp.ling.TaggedWord;
import java.util.ArrayList;
import java.util.List;


/**
 * One extracted is-a relation: a list of instance noun phrases related to a
 * list of class noun phrases (e.g. "{cat} --isa--> {animal}").
 */
public class IsaPattern {
    private ArrayList<NounPhrase> instance;
    private ArrayList<NounPhrase> clazz;

    /**
     * Stores shallow defensive copies of both lists, so later mutation of the
     * caller's lists does not affect this pattern. (Copy-constructor replaces
     * the original unchecked {@code clone()} casts; behavior is identical —
     * the NounPhrase elements themselves are still shared.)
     */
    public IsaPattern(ArrayList<NounPhrase> instance, ArrayList<NounPhrase> clazz) {
        this.instance = new ArrayList<NounPhrase>(instance);
        this.clazz = new ArrayList<NounPhrase>(clazz);
    }

    public ArrayList<NounPhrase> getInstance() {
        return instance;
    }

    public ArrayList<NounPhrase> getClazz() {
        return clazz;
    }

    /** E.g. "{a _cat_}  --isa--> {a small _animal_}" — cores marked with underscores. */
    @Override
    public String toString() {
        return nounPhraseListUnderscoreToString(instance) + " --isa--> " + nounPhraseListUnderscoreToString(clazz);
    }

    /**
     * Formats the phrases as "{phrase|phrase|...}" with the NP core of each
     * phrase wrapped in underscores; "{}" for an empty list. Note: the
     * pre-/post-modifier words keep a trailing space before the underscore
     * separator, matching the original output format.
     */
    private static String nounPhraseListUnderscoreToString(ArrayList<NounPhrase> nps) {
        if (nps.size() == 0) {
            return "{}";
        }
        StringBuilder result = new StringBuilder();
        result.append("{");
        for (NounPhrase np : nps) {
            for (TaggedWord tw : np.getPreModifier()) {
                result.append(tw.word()).append(" ");
            }
            result.append("_");
            result.append(np.getNPCore().word());
            result.append("_");
            for (TaggedWord tw : np.getPostModifier()) {
                result.append(tw.word()).append(" ");
            }
            result.append("|");
        }
        result.setLength(result.length() - 1); // drop the final "|"
        result.append("}");
        return result.toString();
    }
}
139 changes: 139 additions & 0 deletions core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package org.dbpedia.extraction.hearst;

import java.util.ArrayList;

import edu.stanford.nlp.ling.TaggedWord;
import java.util.StringJoiner;

/**
 * Mutable accumulator for a noun phrase built word-by-word during Hearst
 * pattern extraction: a head word (NPCore) plus pre- and post-modifier words,
 * with the total phrase length capped at maxNPLength.
 */
public class NounPhrase {

    // Head word of the phrase. Public and directly accessible (also via getNPCore()).
    public TaggedWord NPCore;
    private ArrayList<TaggedWord> preModifier, postModifier;
    // Set once the phrase has reached maxNPLength and cannot take more words.
    private boolean isComplete;
    // NOTE(review): this flag is never assigned anywhere in the class, so
    // isCoreFound() always returns false — confirm intended semantics with callers.
    private boolean coreFound;
    // Maximum number of words (modifiers + core) the phrase may hold.
    private int maxNPLength;

    public NounPhrase(int maxNPLength) {
        this.maxNPLength = maxNPLength;

        isComplete = false;
        preModifier = new ArrayList<TaggedWord>();
        postModifier = new ArrayList<TaggedWord>();
    }

    /**
     * Prepends a pre-modifier word; if the pre-modifier list alone reaches
     * maxNPLength, the last (oldest-added) element is dropped so the list
     * stays below the cap.
     */
    public void addPreModifier(TaggedWord tw) {
        preModifier.add(0, tw);
        if (preModifier.size() == maxNPLength) {
            preModifier.remove(preModifier.size() - 1);
        }
    }

    /**
     * Appends a post-modifier word. If the total phrase length (pre + core +
     * post) exceeds the cap, a pre-modifier is dropped to make room; once no
     * pre-modifiers remain, the phrase is marked complete.
     */
    public void addPostModifier(TaggedWord tw) {
        postModifier.add(tw);
        if (postModifier.size() + 1 + preModifier.size() > maxNPLength) {
            if (preModifier.size() > 0) {
                preModifier.remove(0);
            } else {
                isComplete = true;
            }
        }
    }

    /**
     * Demotes the current core to a post-modifier and installs tw as the new
     * core; marks the phrase complete when the cap is reached.
     */
    public void NPCoreToPost(TaggedWord tw) {
        postModifier.add(NPCore);
        NPCore = tw;
        if (postModifier.size() + 1 == maxNPLength) {
            isComplete = true;
        }
    }

    public void clearPreMod() {
        preModifier.clear();
    }

    public void clearPostMod() {
        postModifier.clear();
    }

    /** Sets the core word; marks the phrase complete when core + post-modifiers reach the cap. */
    public void setNPCore(TaggedWord tw) {
        NPCore = tw;
        if (postModifier.size() + 1 == maxNPLength) {
            isComplete = true;
        }
    }

    public TaggedWord getNPCore() {
        return NPCore;
    }

    public ArrayList<TaggedWord> getPreModifier() {
        return preModifier;
    }

    public ArrayList<TaggedWord> getPostModifier() {
        return postModifier;
    }

    /** Pre-modifier words joined with single spaces; empty string if none. */
    public String getPreModifierText() {
        StringJoiner joiner = new StringJoiner(" ");
        for (TaggedWord tw : preModifier) {
            joiner.add(tw.word());
        }
        return joiner.toString();
    }

    /** Post-modifier words joined with single spaces; empty string if none. */
    public String getPostModifierText() {
        StringJoiner joiner = new StringJoiner(" ");
        for (TaggedWord tw : postModifier) {
            joiner.add(tw.word());
        }
        return joiner.toString();
    }

    public String getNPCoreText() {
        return NPCore.word();
    }

    public int getMaxNPLength() {
        return maxNPLength;
    }

    // NOTE(review): always false — see coreFound field comment above.
    public boolean isCoreFound() {
        return coreFound;
    }

    public boolean isComplete() {
        return isComplete;
    }

    /** Full phrase text: "pre... core post...", space-separated. */
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (TaggedWord tw : preModifier) {
            sb.append(tw.word()).append(" ");
        }
        sb.append(NPCore.word()).append(" ");
        for (TaggedWord tw : postModifier) {
            sb.append(tw.word()).append(" ");
        }
        if (sb.length() > 0) {
            sb.setLength(sb.length() - 1);
        }
        return sb.toString();
    }

    /** POS tags of the phrase in the same order as toString(), space-separated. */
    public String tagsToString() {
        StringBuilder sb = new StringBuilder();
        for (TaggedWord tw : preModifier) {
            sb.append(tw.tag()).append(" ");
        }
        sb.append(NPCore.tag()).append(" ");
        for (TaggedWord tw : postModifier) {
            sb.append(tw.tag()).append(" ");
        }
        if (sb.length() > 0) {
            sb.setLength(sb.length() - 1);
        }
        return sb.toString();
    }
}
14 changes: 13 additions & 1 deletion core/src/main/resources/datasetdefinitions.json
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,13 @@
"traits":"LinkedData, Published",
"desc": "Dataset linking a DBpedia resource to the same resource in other wikis.",
"defaultgraph": "dataset"
},
},
"interwiki_links_link_section": {
"name": "InterWiki Links in Link Section",
"traits":"LinkedData, Published",
"desc": "All links which appear in a link section and link to another wiki.",
"defaultgraph": "dataset"
},
"interlanguage_links_chapters": {
"name": "Interlanguage Links between DBpedia Chapters",
"traits":"LinkedData, Published",
Expand Down Expand Up @@ -471,6 +477,12 @@
"traits":"LinkedData",
"desc": "These are all equations collected during the NIF extraction, transformed into MathML XML syntax.",
"defaultgraph": "dataset"
},
"hearst_patterns": {
"name": "Hearst Patterns in the short abstract",
"traits":"LinkedData",
"desc": "Hearst Patterns extracted from the short abstract.",
"defaultgraph": "dataset"
}
},
"links":{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ object DBpediaDatasets extends java.io.Serializable
val PageIds: Dataset = datasets("page_ids")
val InterLanguageLinks: Dataset = datasets("interlanguage_links") // Since the inter-language links were moved from Wikipedia to Wikidata, we now extract these links from the Wikidata dump, not from Wikipedia pages.")
val InterWikiLinks: Dataset = datasets("interwiki_links")
val InterWikiLinksLinkSection: Dataset = datasets("interwiki_links_link_section")
val InterLanguageLinksChapter: Dataset = datasets("interlanguage_links_chapters")
val Genders: Dataset = datasets("genders")
val TopicalConcepts: Dataset = datasets("topical_concepts")
Expand Down Expand Up @@ -229,6 +230,7 @@ object DBpediaDatasets extends java.io.Serializable
val NifTextLinks: Dataset = datasets("nif_text_links")
val RawTables: Dataset = datasets("raw_tables")
val Equations: Dataset = datasets("equations")
val HearstPatterns: Dataset = datasets("hearst_patterns")

/**
* Links
Expand Down
Loading

0 comments on commit 2f3899e

Please sign in to comment.