Skip to content

Commit

Permalink
Implement more advanced compression algorithm
Browse files Browse the repository at this point in the history
 - The difference is in the splitting step: the input is now split on a
   larger set of delimiters, and consecutive delimiters are grouped into a
   single token where possible.
  • Loading branch information
avolny committed Apr 7, 2020
1 parent 4d65b2d commit 029708f
Showing 1 changed file with 35 additions and 8 deletions.
43 changes: 35 additions & 8 deletions src/main/java/str_exporter/StringCompression.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ public int compareTo(Word o) {
}

/**
 * Compresses {@code s} using the default delimiter set: whitespace plus the
 * punctuation characters {@code ; : , . /}.
 *
 * <p>The delimiter string is consumed by a {@code java.util.StringTokenizer}
 * (see {@code splitStringSmart}), which interprets it as a plain set of
 * characters rather than a regular expression. The whitespace characters are
 * therefore listed literally; a regex-style {@code "\\s"} would instead make
 * the backslash and the letter 's' delimiters and leave spaces unsplit.
 *
 * @param s the text to compress
 * @return the compressed text, as produced by {@code compress(String, String)}
 */
public static String compress(String s) {
    return compress(s, " \t\n\r\f;:,./");
}

public static String compress(String s, String delims) {

long start = System.nanoTime();

Expand All @@ -46,13 +50,14 @@ public static String compress(String s) {
s = s.replace('|', ':').replace('&', ':');
ArrayList<String> compressionDict = new ArrayList<>();

int[] ns = {1, 10, 9, 8, 7, 6, 5, 4, 3, 2};
int[] ns;

ns = new int[]{1, 19, 17, 15, 13, 11, 9, 7, 5, 3};

// int n = 1;
for (int i = 0; i < ns.length && compressionDict.size() < WILDCARDS.length(); i++) {
// System.out.printf("n=%d\n", n);

String[] parts = s.split(" ");
String[] parts = splitStringSmart(s, delims);

int n = Math.min(ns[i], parts.length);

String[] ngrams = new String[parts.length - n + 1];
Expand All @@ -63,14 +68,10 @@ public static String compress(String s) {
StringBuilder ngram = new StringBuilder();
for (int k = 0; k < n; k++) {
ngram.append(parts[j + k]);
if (k < n - 1) {
ngram.append(' ');
}
}

ngrams[j] = ngram.toString();
}

// compute cost saving of compressing each n-gram
for (int j = 0; j < ngrams.length; j++) {
String word = ngrams[j];
Expand Down Expand Up @@ -134,4 +135,30 @@ public static String compress(String s) {
// SlayTheRelicsExporter.logger.info(String.format("compression, original len: %s new len: %s ratio %.2f, duration %.2f ms", uncompressedLength, compressedLength, compressedLength * 1f / uncompressedLength, (end-start)/1e6));
return s;
}

/**
 * Splits {@code s} into alternating runs of content and delimiters.
 *
 * <p>Every character of {@code delims} acts as a delimiter. Delimiters are kept
 * in the output, and a run of adjacent delimiters is merged into one element,
 * so concatenating the returned array reproduces {@code s} exactly.
 *
 * @param s      the text to split
 * @param delims the set of delimiter characters (not a regular expression)
 * @return the content pieces and grouped delimiter runs, in original order
 */
private static String[] splitStringSmart(String s, String delims) {
    ArrayList<String> pieces = new ArrayList<>();
    StringTokenizer cursor = new StringTokenizer(s, delims, true);

    // Tracks whether the most recently stored piece is a delimiter run.
    boolean previousWasDelimiter = false;

    while (cursor.hasMoreTokens()) {
        final String piece = cursor.nextToken();
        final boolean isDelimiter = piece.length() == 1 && delims.contains(piece);

        if (isDelimiter && previousWasDelimiter) {
            // Extend the delimiter run that is already at the tail of the list.
            final int tail = pieces.size() - 1;
            pieces.set(tail, pieces.get(tail).concat(piece));
        } else {
            pieces.add(piece);
        }

        previousWasDelimiter = isDelimiter;
    }

    return pieces.toArray(new String[0]);
}

}

0 comments on commit 029708f

Please sign in to comment.