-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
SuffixArray.java
179 lines (154 loc) · 6.07 KB
/
SuffixArray.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
package com.jwetherell.algorithms.data_structures;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
/**
* In computer science, a suffix array is a sorted array of all suffixes of a string.
* It is a data structure used, among others, in full text indices, data compression
* algorithms and within the field of bibliometrics.
* <p>
* @see <a href="https://en.wikipedia.org/wiki/Suffix_array">Suffix Array (Wikipedia)</a>
* <p>
* NOTE: This implementation returns starting indexes instead of full suffixes
* <br>
* @author Jakub Szarawarski <[email protected]>
* @author Justin Wetherell <[email protected]>
*/
public class SuffixArray {
private static final StringBuilder STRING_BUILDER = new StringBuilder();
private static final char DEFAULT_END_SEQ_CHAR = '$';
private final char endSeqChar;
private String string;
private ArrayList<Integer> suffixArray;
private ArrayList<Integer> KMRarray;
public SuffixArray(CharSequence sequence) {
this(sequence, DEFAULT_END_SEQ_CHAR);
}
public SuffixArray(CharSequence sequence, char endChar) {
endSeqChar = endChar;
string = buildStringWithEndChar(sequence);
}
public ArrayList<Integer> getSuffixArray() {
if (suffixArray == null)
KMRalgorithm();
return suffixArray;
}
/**
* @return inverted suffix array
*/
public ArrayList<Integer> getKMRarray() {
if (KMRarray == null)
KMRalgorithm();
return KMRarray;
}
public String getString(){
return string;
}
/**
* Creates suffix array using KMR algorithm with O(n log^2 n) complexity.
*
* For radius r:
* KMR[i] == k,
* when string[i..i+r-1] is kth r-letter substring of string sorted lexicographically
* KMR is counted for radius = 1,2,4,8 ...
* KMR for radius bigger than string length is the inverted suffix array
*/
private void KMRalgorithm() {
final int length = string.length();
ArrayList<KMRsWithIndex> KMRinvertedList = new ArrayList<KMRsWithIndex>();
ArrayList<Integer> KMR = getBasicKMR(length);
int radius = 1;
while (radius < length) {
KMRinvertedList = getKMRinvertedList(KMR, radius, length);
KMR = getKMR(KMRinvertedList, length);
radius *= 2;
}
KMRarray = new ArrayList<Integer>(KMR.subList(0, length));
suffixArray = new ArrayList<Integer>();
for (KMRsWithIndex kmr : KMRinvertedList) {
suffixArray.add(new Integer(kmr.index));
}
}
/**
* Creates KMR array for new radius from nearly inverted array.
* Elements from inverted array need to be grouped by substring tey represent.
*
* @param KMRinvertedList indexes are nearly inverted KMR array
* @param length string length
* @return KMR array for new radius
*/
private ArrayList<Integer> getKMR(ArrayList<KMRsWithIndex> KMRinvertedList, int length) {
final ArrayList<Integer> KMR = new ArrayList<Integer>(length*2);
for (int i=0; i<2*length; i++)
KMR.add(new Integer(-1));
int counter = 0;
for (int i=0; i<length; i++){
if(i>0 && substringsAreEqual(KMRinvertedList, i))
counter++;
KMR.set(KMRinvertedList.get(i).index, new Integer(counter));
}
return KMR;
}
private boolean substringsAreEqual(ArrayList<KMRsWithIndex> KMRinvertedList, int i) {
return (KMRinvertedList.get(i-1).beginKMR.equals(KMRinvertedList.get(i).beginKMR) == false) ||
(KMRinvertedList.get(i-1).endKMR.equals(KMRinvertedList.get(i).endKMR) == false);
}
/**
* helper method to create KMR array for radius = radius from KMR array for radius = radius/2
*
* @param KMR KMR array for radius = radius/2
* @param radius new radius
* @param length string length
* @return list of KMRsWithIndex which indexes are nearly inverted KMR array
*/
private ArrayList<KMRsWithIndex> getKMRinvertedList(ArrayList<Integer> KMR, int radius, int length) {
final ArrayList<KMRsWithIndex> KMRinvertedList = new ArrayList<KMRsWithIndex>();
for (int i=0; i<length; i++)
KMRinvertedList.add(new KMRsWithIndex(KMR.get(i), KMR.get(i+radius), i));
Collections.sort(KMRinvertedList,
new Comparator<KMRsWithIndex>() {
@Override
public int compare(KMRsWithIndex A, KMRsWithIndex B) {
if (A.beginKMR.equals(B.beginKMR) == false)
return A.beginKMR.compareTo(B.beginKMR);
if (A.endKMR.equals(B.endKMR) == false)
return A.endKMR.compareTo(B.endKMR);
return A.index.compareTo(B.index);
}
}
);
return KMRinvertedList;
}
/**
* KMR array for radius=1, instead of initial natural numbers ascii codes are used
*
* @param length length of string
* @return pseudo KMR array for radius=1
*/
private ArrayList<Integer> getBasicKMR(int length) {
final ArrayList<Integer> result = new ArrayList<Integer>(length*2);
final char[] characters = string.toCharArray();
for (int i=0; i<length; i++)
result.add(new Integer(characters[i]));
for (int i=0; i<length; i++)
result.add(new Integer(-1));
return result;
}
private String buildStringWithEndChar(CharSequence sequence) {
STRING_BUILDER.setLength(0);
STRING_BUILDER.append(sequence);
if (STRING_BUILDER.indexOf(String.valueOf(endSeqChar)) < 0)
STRING_BUILDER.append(endSeqChar);
return STRING_BUILDER.toString();
}
private class KMRsWithIndex{
Integer beginKMR;
Integer endKMR;
Integer index;
KMRsWithIndex(Integer begin, Integer end, Integer index){
this.beginKMR = begin;
this.endKMR = end;
this.index = index;
}
}
}