Skip to content
This repository has been archived by the owner on May 29, 2020. It is now read-only.

Some cosmetic changes #27

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/main/scala/chalk/corpora/MascUtil.scala
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ object MascUtil {

def getNodes(doc: Elem) = (doc \\ "node").toSeq.flatMap { nxml =>
val link = (nxml \ "link")
if (!link.isEmpty) {
if (link.nonEmpty) {
val targets = (link.head \ "@targets").toString.split(" ").toSeq
Some(MNode(xmlId(nxml), targets))
} else throw new Exception("Missing link element.") //None OK?
Expand All @@ -291,7 +291,7 @@ object MascUtil {
// Have to go through some pains to make sure we get a POS for every token.
def getPos(anno: MAnnotation) = {
if (anno.features.isDefinedAt("msd")) anno.features("msd")
else if (anno.features.get("kind").getOrElse("") == "urlAddress") "URL"
else if (anno.features.getOrElse("kind", "") == "urlAddress") "URL"
else if (anno.features.isDefinedAt("categor")) anno.features("categor")
else "UNK"
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/chalk/lang/eng/PorterStemmer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,10 @@ class PorterStemmer {
def vowelInStem(s: String): Boolean = {
for (i <- 0 to b.length - 1 - s.length) {
if (!cons(i)) {
return true
true
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

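Sorry, this has to stay `return true`. Inside the `for` loop a bare `true` is just an expression whose value is discarded, so without `return` the loop never exits early and the method always falls through to the final `false`.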

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

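alternatively: !(0 until b.length - s.length).forall(cons) (this ranges over the same indices as the loop, 0 to b.length - 1 - s.length; cons takes an index, so slicing the suffix s would pass its characters instead of positions)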

}
}
return false
false
}

/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
Expand Down
20 changes: 10 additions & 10 deletions src/main/scala/chalk/text/HTML.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,35 +22,35 @@ package chalk.text
*/
object HTML {

val regex = "&#?\\w+;".r;
val regex = "&#?\\w+;".r

/** Unescapes all HTML entities in the given input. */
def unescapeEntitiesIn(input : String) = {
regex.replaceAllIn(input, m => {
val txt = m.group(0);
val txt = m.group(0)
try {
val codepoint = {
if (txt(1) == '#') {
if (txt(2) == 'x') {
Integer.parseInt(txt.substring(3, txt.length-1), 16);
Integer.parseInt(txt.substring(3, txt.length-1), 16)
} else {
Integer.parseInt(txt.substring(2, txt.length-1));
Integer.parseInt(txt.substring(2, txt.length-1))
}
} else {
entities(txt.substring(1, txt.length-1));
entities(txt.substring(1, txt.length-1))
}
}
if (codepoint == '$') {
new String("\\$");
new String("\\$")
} else {
new String(Character.toChars(codepoint));
new String(Character.toChars(codepoint))
}
} catch {
case _:Exception =>
// exception while processing .. append raw input
txt;
txt
}
});
})
}

/** List of HTML entities with their corresponding code points borrowed from pythons htmlentities package. */
Expand Down Expand Up @@ -307,5 +307,5 @@ object HTML {
"ouml" -> 246,
"raquo" -> 187,
"sigma" -> 963
);
)
}
12 changes: 6 additions & 6 deletions src/main/scala/chalk/text/Unicode.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,21 @@ package chalk.text
object Unicode {
private def inRanges(cp : Int, rangeStarts : Array[Int], rangeEnds : Array[Int]) : Boolean = {
if (cp < 0) {
return false;
return false
}

var i = 0;
var i = 0
while (i < rangeStarts.length && cp < rangeStarts(i)) {
i += 1;
i += 1
}
(i < rangeEnds.length) && (cp <= rangeEnds(i));
(i < rangeEnds.length) && (cp <= rangeEnds(i))
}

private val punctuationRangeStarts =
Array(0xFF01,0xFF1A,0xFF3B,0xFF5B,0xFFE0,0xFE10,0xFE30,0x3000,0xFE50,0x2E00,0x0021,0x003A,0x005B,0x007B,0x2000,0x0080,0x00A1,0x00B4,0x00B6,0x00BF,0x00D7,0x00F7).sorted;
Array(0xFF01,0xFF1A,0xFF3B,0xFF5B,0xFFE0,0xFE10,0xFE30,0x3000,0xFE50,0x2E00,0x0021,0x003A,0x005B,0x007B,0x2000,0x0080,0x00A1,0x00B4,0x00B6,0x00BF,0x00D7,0x00F7).sorted

private val punctuationRangeEnds =
Array(0xFF0F,0xFF20,0xFF40,0xFF65,0xFFEE,0xFE1F,0xFE4F,0x303F,0xFE6F,0x2E7F,0x002F,0x003F,0x0060,0x007E,0x206F,0x00FF,0x00B1,0x00B4,0x00BB,0x00BF,0x00D7,0x00F7).sorted;
Array(0xFF0F,0xFF20,0xFF40,0xFF65,0xFFEE,0xFE1F,0xFE4F,0x303F,0xFE6F,0x2E7F,0x002F,0x003F,0x0060,0x007E,0x206F,0x00FF,0x00B1,0x00B4,0x00BB,0x00BF,0x00D7,0x00F7).sorted

/** Returns true if the given unicode code point is punctuation. */
def isPunctuation(cp : Int) = {
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/chalk/text/analyze/CaseFolder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,5 @@ class CaseFolder extends Analyzer {
}

object CaseFolder extends CaseFolder {
override def apply(in: String): String = in.toLowerCase;
override def apply(in: String): String = in.toLowerCase
}
50 changes: 25 additions & 25 deletions src/main/scala/chalk/text/analyze/EnglishWordClassGenerator.scala
Original file line number Diff line number Diff line change
Expand Up @@ -10,60 +10,60 @@ object EnglishWordClassGenerator extends Analyzer with Serializable {
def apply(x: String) = signatureFor(x)

def signatureFor(word: String) = {
val sb = new StringBuilder;
val wlen = word.length();
val numCaps = (word: Seq[Char]).count(_.isUpper);
val hasDigit = word.exists(_.isDigit);
val hasDash = word.contains('-');
val hasLower = numCaps < wlen;
val ch0 = word.charAt(0);
val lowered = word.toLowerCase();
val sb = new StringBuilder
val wlen = word.length()
val numCaps = (word: Seq[Char]).count(_.isUpper)
val hasDigit = word.exists(_.isDigit)
val hasDash = word.contains('-')
val hasLower = numCaps < wlen
val ch0 = word.charAt(0)
val lowered = word.toLowerCase()
if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
if (numCaps == 1) {
sb.append("-INITC");
sb.append("-INITC")
} else {
sb.append("-CAPS");
sb.append("-CAPS")
}
} else if (!Character.isLetter(ch0) && numCaps > 0) {
sb.append("-CAPS");
sb.append("-CAPS")
} else if (hasLower) {
sb.append("-LC");
sb.append("-LC")
}

if (hasDigit) {
sb.append("-NUM");
sb.append("-NUM")
}
if (hasDash) {
sb.append("-DASH");
sb.append("-DASH")
}
if (lowered.endsWith("s") && wlen >= 3) {
// here length 3, so you don't miss out on ones like 80s
val ch2 = lowered.charAt(wlen - 2);
val ch2 = lowered.charAt(wlen - 2)
// not -ess suffixes or greek/latin -us, -is
if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
sb.append("-s");
}
} else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
if (lowered.endsWith("ed")) {
sb.append("-ed");
sb.append("-ed")
} else if (lowered.endsWith("ing")) {
sb.append("-ing");
sb.append("-ing")
} else if (lowered.endsWith("ion")) {
sb.append("-ion");
sb.append("-ion")
} else if (lowered.endsWith("er")) {
sb.append("-er");
sb.append("-er")
} else if (lowered.endsWith("est")) {
sb.append("-est");
sb.append("-est")
} else if (lowered.endsWith("ly")) {
sb.append("-ly");
sb.append("-ly")
} else if (lowered.endsWith("ity")) {
sb.append("-ity");
sb.append("-ity")
} else if (lowered.endsWith("y")) {
sb.append("-y");
sb.append("-y")
} else if (lowered.endsWith("al")) {
sb.append("-al");
sb.append("-al")
}
}
sb.toString;
sb.toString
}
}
2 changes: 1 addition & 1 deletion src/main/scala/chalk/text/analyze/PorterStemmer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ object PorterStemmer extends Stemmer {
def extra(w: String) = {
if (w.endsWith("at") || w.endsWith("bl") || w.endsWith("iz")) w + 'e'
// double consonant:
else if (doublec(w) && !("lsz".contains(w.last))) w.substring(0, w.length - 1);
else if (doublec(w) && !("lsz".contains(w.last))) w.substring(0, w.length - 1)
else if (m(w) == 1 && cvc(w)) w + "e"
else w
}
Expand Down
12 changes: 6 additions & 6 deletions src/main/scala/chalk/text/analyze/WordShapeGenerator.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@ object WordShapeGenerator extends Analyzer with Serializable {
def apply(v1: String) = signatureFor(v1)

def signatureFor(word: String) = {
val result = new StringBuilder(word.length);
var i = 0;
val result = new StringBuilder(word.length)
var i = 0
while (i < word.length) {
val c = word(i);
val x = if (c.isLetter && c.isUpper) 'X' else if (c.isLetter) 'x' else if (c.isDigit) 'd' else c;
val c = word(i)
val x = if (c.isLetter && c.isUpper) 'X' else if (c.isLetter) 'x' else if (c.isDigit) 'd' else c
if (result.length > 1 && (result.last == x) && result(result.length - 2) == x) {
result += 'e'
} else if (result.length > 1 && result.last == 'e' && result(result.length - 2) == x) {
() // nothing
} else {
result += x;
result += x
}
i += 1;
i += 1
}
result.toString
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ package chalk.text.tokenize
case class RegexSearchTokenizer(pattern : String)
extends Tokenizer {
override def apply(doc : String) = new Iterable[String] {
override def iterator = (pattern.r.findAllIn(doc));
override def iterator = (pattern.r.findAllIn(doc))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ package chalk.text.tokenize
* @author dramage
*/
case class RegexSplitTokenizer(pattern : String) extends Tokenizer {
override def apply(doc : String) = doc.split(pattern);
override def apply(doc : String) = doc.split(pattern)
}

66 changes: 33 additions & 33 deletions src/main/scala/chalk/text/tokenize/SimpleEnglishTokenizer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,34 +31,34 @@ import breeze.io.TextReader;
*
* @author dramage
*/
trait SimpleEnglishTokenizer extends Tokenizer;
trait SimpleEnglishTokenizer extends Tokenizer

object SimpleEnglishTokenizer {

def apply() = V1();
def apply() = V1()

/** Version 0 of the SimpleEnglishTokenizer. */
class V0 extends SimpleEnglishTokenizer {
override def apply(in : String) : Iterable[String] = {
var string = in;
string = V0.r1.replaceAllIn(string, "");
string = V0.r2.replaceAllIn(string, "$1 ");
string = V0.r3.replaceAllIn(string, " $1");
string.split("\\s+");
var string = in
string = V0.r1.replaceAllIn(string, "")
string = V0.r2.replaceAllIn(string, "$1 ")
string = V0.r3.replaceAllIn(string, " $1")
string.split("\\s+")
}
}

object V0 {

// delete word-final hyphens when followed by newlines
val r1 = "(?<=\\w)-\\s*\n\\s*".r;
val r1 = "(?<=\\w)-\\s*\n\\s*".r

// add spaces around non-word-internal punctuation
val r2 = "(?<=\\W)(\\p{P})(?! )".r;
val r3 = "(?! )(\\p{P})(?=\\W)".r;
val r2 = "(?<=\\W)(\\p{P})(?! )".r
val r3 = "(?! )(\\p{P})(?=\\W)".r

private val _instance = new V0();
def apply() = _instance;
private val _instance = new V0()
def apply() = _instance

def name = "SimpleEnglishTokenizer.V0"
}
Expand All @@ -74,51 +74,51 @@ object SimpleEnglishTokenizer {
apply(TextReader.fromString(in)).toIterable

def apply(in : TextReader) : Iterator[String] = new Iterator[String] {
var nv : String = null;
var sb = new java.lang.StringBuilder();
var nv : String = null
var sb = new java.lang.StringBuilder()

prepare();
prepare()

private def prepare() {
in.skipWhitespace();
in.skipWhitespace()

val cp = in.peek();
val cp = in.peek()

if (cp == -1) {
nv = null;
nv = null
} else if (Character.isLetterOrDigit(cp)) {
nv = in.readWhile(Character.isLetterOrDigit);
nv = in.readWhile(Character.isLetterOrDigit)
if (Unicode.isPunctuation(in.peek(0)) && Character.isLetterOrDigit(in.peek(1))) {
sb.setLength(0);
sb.append(nv);
sb.setLength(0)
sb.append(nv)
do {
sb.append(Character.toChars(in.read));
sb.append(in.readWhile(Character.isLetterOrDigit));
} while (Unicode.isPunctuation(in.peek(0)) && Character.isLetterOrDigit(in.peek(1)));
nv = sb.toString;
sb.append(Character.toChars(in.read))
sb.append(in.readWhile(Character.isLetterOrDigit))
} while (Unicode.isPunctuation(in.peek(0)) && Character.isLetterOrDigit(in.peek(1)))
nv = sb.toString
}
} else if (Unicode.isPunctuation(cp)) {
nv = in.readWhile(Unicode.isPunctuation);
nv = in.readWhile(Unicode.isPunctuation)
} else {
nv = in.readWhile((c : Int) => !Character.isWhitespace(c));
nv = in.readWhile((c : Int) => !Character.isWhitespace(c))
}
}

def hasNext =
nv != null;
nv != null

def next = {
val rv = nv;
prepare();
rv;
val rv = nv
prepare()
rv
}
}
}

object V1 {

private val _instance = new V1();
def apply() = _instance;
private val _instance = new V1()
def apply() = _instance

}
}
Loading