forked from dbpedia/extraction-framework
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
updated template class extraction updated abstract extraction
- Loading branch information
Showing
13 changed files
with
1,079 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
111 changes: 111 additions & 0 deletions
111
core/src/main/java/org/dbpedia/extraction/hearst/CustomPattern.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
package org.dbpedia.extraction.hearst; | ||
|
||
import java.util.regex.Pattern; | ||
|
||
/**
 * A Hearst-style extraction pattern: a caller-supplied regular expression that is wrapped
 * with a fixed prefix and/or suffix (depending on {@code type}) and compiled once at
 * construction time.
 *
 * <p>Recognised types and the expression that is compiled for each:
 * <ul>
 *   <li>{@code "compact"} and {@code "split"}: prefix + regex + suffix</li>
 *   <li>{@code "split_noPrefix"}: atomic group around regex + suffix</li>
 *   <li>{@code "split_noSuffix"}: atomic group around prefix + regex</li>
 * </ul>
 * For any other type no pattern is compiled and {@link #getPattern()} returns {@code null}.
 */
public class CustomPattern {

    /** Optional single quote character (ASCII or typographic) that may precede the following word. */
    private static final String SURROUNDER_SYMBOLS =
            "[\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u0022]?";

    /**
     * Optional single symbol that may follow the preceding word: every quote character from
     * {@link #SURROUNDER_SYMBOLS} plus ampersand, copyright sign and registered sign.
     */
    private static final String END_SYMBOLS =
            "[\"\\u0026\\u0027\\u2018\\u2019\\u201A\\u201B\\u201C\\u201D\\u201E\\u201F\\u00A9\\u00AE]?";

    /** One letter or digit (captured), optionally followed by an end symbol. */
    private static final String PREFIX = "(\\p{L}|\\d)" + END_SYMBOLS;

    /** An optional surrounding quote followed by one letter or digit (captured). */
    private static final String SUFFIX = SURROUNDER_SYMBOLS + "(\\p{L}|\\d)";

    private final String pid;
    private final String regex;
    private final String type;
    private final Pattern pattern;
    private final String preCondition;
    private final String firstKeyWord;
    private final String secondKeyWord;
    private final Boolean instanceFirst;

    /**
     * Creates a pattern without a precondition and without keywords.
     *
     * @param pid           identifier of the pattern
     * @param regex         core regular expression, embedded verbatim between prefix and suffix
     * @param type          pattern type, see the class description
     * @param instanceFirst value later returned by {@link #getInstanceFirst()}
     * @throws NullPointerException if {@code type} is {@code null}
     * @throws java.util.regex.PatternSyntaxException if the assembled expression is invalid
     */
    public CustomPattern(String pid, String regex, String type, Boolean instanceFirst) {
        this(pid, regex, type, null, null, null, instanceFirst);
    }

    /**
     * Creates a pattern with a precondition but without keywords.
     *
     * @param pid           identifier of the pattern
     * @param regex         core regular expression, embedded verbatim between prefix and suffix
     * @param type          pattern type, see the class description
     * @param preCond       precondition string; stored only, not interpreted by this class
     * @param instanceFirst value later returned by {@link #getInstanceFirst()}
     * @throws NullPointerException if {@code type} is {@code null}
     * @throws java.util.regex.PatternSyntaxException if the assembled expression is invalid
     */
    public CustomPattern(String pid, String regex, String type, String preCond, Boolean instanceFirst) {
        this(pid, regex, type, preCond, null, null, instanceFirst);
    }

    /**
     * Creates a fully specified pattern. The other constructors delegate to this one.
     *
     * @param pid           identifier of the pattern
     * @param regex         core regular expression, embedded verbatim between prefix and suffix
     * @param type          pattern type, see the class description
     * @param preCond       precondition string; stored only, not interpreted by this class
     * @param fkw           first keyword, returned by {@link #getFirstKeyWord()}
     * @param skw           second keyword, returned by {@link #getSecondKeyWord()}
     * @param instanceFirst value later returned by {@link #getInstanceFirst()}
     * @throws NullPointerException if {@code type} is {@code null}
     * @throws java.util.regex.PatternSyntaxException if the assembled expression is invalid
     */
    public CustomPattern(String pid, String regex, String type, String preCond, String fkw, String skw,
            Boolean instanceFirst) {
        this.pid = pid;
        this.regex = regex;
        this.type = type;
        this.preCondition = preCond;
        this.firstKeyWord = fkw;
        this.secondKeyWord = skw;
        this.instanceFirst = instanceFirst;
        this.pattern = compilePattern(regex, type);
    }

    /**
     * Wraps {@code regex} according to {@code type} and compiles the result.
     *
     * @return the compiled pattern, or {@code null} when {@code type} is not a recognised type
     */
    private static Pattern compilePattern(String regex, String type) {
        // Switching on a null String throws NullPointerException, as type.equals(...) would.
        switch (type) {
            case "compact":
            case "split":
                return Pattern.compile(PREFIX + regex + SUFFIX);
            case "split_noPrefix":
                return Pattern.compile("(?>" + regex + SUFFIX + ")");
            case "split_noSuffix":
                return Pattern.compile("(?>" + PREFIX + regex + ")");
            default:
                // Unknown types deliberately leave the pattern unset.
                return null;
        }
    }

    /** @return the compiled pattern, or {@code null} if the type was not recognised */
    public Pattern getPattern() {
        return pattern;
    }

    /** @return the type string passed to the constructor */
    public String getType() {
        return type;
    }

    /** @return the first keyword, or {@code null} if none was supplied */
    public String getFirstKeyWord() {
        return firstKeyWord;
    }

    /** @return the second keyword, or {@code null} if none was supplied */
    public String getSecondKeyWord() {
        return secondKeyWord;
    }

    /** @return the {@code instanceFirst} flag passed to the constructor */
    public Boolean getInstanceFirst() {
        return instanceFirst;
    }
}
648 changes: 648 additions & 0 deletions
648
core/src/main/java/org/dbpedia/extraction/hearst/ExtractHearstPatterns.java
Large diffs are not rendered by default.
Oops, something went wrong.
71 changes: 71 additions & 0 deletions
71
core/src/main/java/org/dbpedia/extraction/hearst/IsaPattern.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
package org.dbpedia.extraction.hearst; | ||
|
||
import edu.stanford.nlp.ling.TaggedWord; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
|
||
public class IsaPattern { | ||
private ArrayList<NounPhrase> instance; | ||
private ArrayList<NounPhrase> clazz; | ||
|
||
public IsaPattern(ArrayList<NounPhrase> instance, ArrayList<NounPhrase> clazz) { | ||
this.instance = (ArrayList<NounPhrase>)instance.clone(); | ||
this.clazz = (ArrayList<NounPhrase>)clazz.clone(); | ||
} | ||
|
||
public ArrayList<NounPhrase> getInstance() { | ||
return instance; | ||
} | ||
|
||
public ArrayList<NounPhrase> getClazz() { | ||
return clazz; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return nounPhraseListUnderscoreToString(instance) + " --isa--> " + nounPhraseListUnderscoreToString(clazz); | ||
} | ||
|
||
|
||
private static String nounPhraseListToString(ArrayList<NounPhrase> nps) { | ||
if (nps.size() == 0) { | ||
return "{}"; | ||
} | ||
StringBuilder result = new StringBuilder(); | ||
result.append("{"); | ||
for (NounPhrase np : nps) { | ||
result.append(np.toString()).append("|"); | ||
} | ||
result.setLength(result.length() - 1); | ||
result.append("}"); | ||
return result.toString(); | ||
} | ||
|
||
private static String nounPhraseListUnderscoreToString(ArrayList<NounPhrase> nps) { | ||
if (nps.size() == 0) { | ||
return "{}"; | ||
} | ||
StringBuilder result = new StringBuilder(); | ||
result.append("{"); | ||
for (NounPhrase np : nps) { | ||
for (TaggedWord tw : np.getPreModifier()) { | ||
result.append(tw.word()).append(" "); | ||
} | ||
result.append("_"); | ||
result.append(np.getNPCore().word()); | ||
result.append("_"); | ||
for (TaggedWord tw : np.getPostModifier()) { | ||
result.append(tw.word()).append(" "); | ||
} | ||
//if (result.length() > 0) { | ||
// result.setLength(result.length() - 1); | ||
//} | ||
|
||
result.append("|"); | ||
} | ||
result.setLength(result.length() - 1); | ||
result.append("}"); | ||
return result.toString(); | ||
} | ||
} |
139 changes: 139 additions & 0 deletions
139
core/src/main/java/org/dbpedia/extraction/hearst/NounPhrase.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
package org.dbpedia.extraction.hearst; | ||
|
||
import java.util.ArrayList; | ||
|
||
import edu.stanford.nlp.ling.TaggedWord; | ||
import java.util.StringJoiner; | ||
|
||
public class NounPhrase { | ||
|
||
public TaggedWord NPCore; | ||
private ArrayList<TaggedWord> preModifier, postModifier; | ||
private boolean isComplete; | ||
private boolean coreFound; | ||
private int maxNPLength; | ||
|
||
public NounPhrase(int maxNPLength) { | ||
this.maxNPLength = maxNPLength; | ||
|
||
isComplete = false; | ||
preModifier = new ArrayList<TaggedWord>(); | ||
postModifier = new ArrayList<TaggedWord>(); | ||
} | ||
|
||
public void addPreModifier(TaggedWord tw) { | ||
preModifier.add(0, tw); | ||
if (preModifier.size() == maxNPLength) { | ||
preModifier.remove(preModifier.size() - 1); | ||
} | ||
} | ||
|
||
public void addPostModifier(TaggedWord tw) { | ||
postModifier.add(tw); | ||
if (postModifier.size() + 1 + preModifier.size() > maxNPLength) { | ||
if (preModifier.size() > 0) { | ||
preModifier.remove(0); | ||
} else { | ||
isComplete = true; | ||
} | ||
} | ||
} | ||
|
||
public void NPCoreToPost(TaggedWord tw) { | ||
postModifier.add(NPCore); | ||
NPCore = tw; | ||
if (postModifier.size() + 1 == maxNPLength) { | ||
isComplete = true; | ||
} | ||
} | ||
|
||
public void clearPreMod() { | ||
preModifier.clear(); | ||
} | ||
|
||
public void clearPostMod() { | ||
postModifier.clear(); | ||
} | ||
|
||
public void setNPCore(TaggedWord tw) { | ||
NPCore = tw; | ||
if (postModifier.size() + 1 == maxNPLength) { | ||
isComplete = true; | ||
} | ||
} | ||
|
||
public TaggedWord getNPCore() { | ||
return NPCore; | ||
} | ||
|
||
public ArrayList<TaggedWord> getPreModifier() { | ||
return preModifier; | ||
} | ||
|
||
public ArrayList<TaggedWord> getPostModifier() { | ||
return postModifier; | ||
} | ||
|
||
public String getPreModifierText() { | ||
StringJoiner joiner = new StringJoiner(" "); | ||
for (TaggedWord tw : preModifier) { | ||
joiner.add(tw.word()); | ||
} | ||
return joiner.toString(); | ||
} | ||
|
||
public String getPostModifierText() { | ||
StringJoiner joiner = new StringJoiner(" "); | ||
for (TaggedWord tw : postModifier) { | ||
joiner.add(tw.word()); | ||
} | ||
return joiner.toString(); | ||
} | ||
|
||
public String getNPCoreText() { | ||
return NPCore.word(); | ||
} | ||
|
||
public int getMaxNPLength() { | ||
return maxNPLength; | ||
} | ||
|
||
public boolean isCoreFound() { | ||
return coreFound; | ||
} | ||
|
||
public boolean isComplete() { | ||
return isComplete; | ||
} | ||
|
||
|
||
public String toString() { | ||
StringBuilder sb = new StringBuilder(); | ||
for (TaggedWord tw : preModifier) { | ||
sb.append(tw.word()).append(" "); | ||
} | ||
sb.append(NPCore.word()).append(" "); | ||
for (TaggedWord tw : postModifier) { | ||
sb.append(tw.word()).append(" "); | ||
} | ||
if (sb.length() > 0) { | ||
sb.setLength(sb.length() - 1); | ||
} | ||
return sb.toString(); | ||
} | ||
|
||
public String tagsToString() { | ||
StringBuilder sb = new StringBuilder(); | ||
for (TaggedWord tw : preModifier) { | ||
sb.append(tw.tag()).append(" "); | ||
} | ||
sb.append(NPCore.tag()).append(" "); | ||
for (TaggedWord tw : postModifier) { | ||
sb.append(tw.tag()).append(" "); | ||
} | ||
if (sb.length() > 0) { | ||
sb.setLength(sb.length() - 1); | ||
} | ||
return sb.toString(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.