-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_test.go
228 lines (207 loc) · 6.3 KB
/
main_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
package chunker
import (
"fmt"
"reflect"
"testing"
)
// exampleText is a multi-line English paragraph (encyclopedia-style text about
// Argentina, including bracketed footnote markers such as "[a]") used as the
// input of the "Example with wikipedia text" case in TestChunker_Chunk.
//
// It is a raw string literal, so the line breaks are part of the value; the
// value begins with a newline because the text starts on the line after the
// opening backquote.
var exampleText = `
Argentina,[a] officially the Argentine Republic,[b] is a country in the southern half of
South America. Argentina covers an area of 2,780,400 km2 (1,073,500 sq mi),[B] making it
the second-largest country in South America after Brazil, the fourth-largest country in
the Americas, and the eighth-largest country in the world. It shares the bulk of the
Southern Cone with Chile to the west, and is also bordered by Bolivia and Paraguay to
the north, Brazil to the northeast, Uruguay and the South Atlantic Ocean to the east,
and the Drake Passage to the south. Argentina is a federal state subdivided into
twenty-three provinces, and one autonomous city, which is the federal capital and
largest city of the nation, Buenos Aires. The provinces and the capital have their
own constitutions, but exist under a federal system. Argentina claims sovereignty
over the Falkland Islands, South Georgia and the South Sandwich Islands, the Southern
Patagonian Ice Field, and a part of Antarctica.`
// TestChunker_Chunk runs Chunker.Chunk over a table of inputs and checks two
// properties of each result:
//
//   - no returned chunk is longer than the case's maxSize (in every case here
//     maxSize equals the first argument passed to NewChunker), and
//   - the number of returned chunks equals wantChunks.
//
// The two checks report distinct messages, and the size check names the
// offending chunk, so a failure can be diagnosed without re-running the test
// with extra printing.
func TestChunker_Chunk(t *testing.T) {
	tests := []struct {
		name       string
		chunker    *Chunker
		data       string
		wantChunks int
		maxSize    int
	}{
		{
			name:       "Test demo",
			chunker:    NewChunker(40, 10, DefaultSeparators, true, false),
			data:       "This is a test string. It is used to test the chunker. It is a very simple chunker.",
			wantChunks: 3,
			maxSize:    40,
		},
		{
			name:       "Example with wikipedia text",
			chunker:    NewChunker(150, 30, DefaultSeparators, true, false),
			data:       exampleText,
			wantChunks: 9,
			maxSize:    150,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := tt.chunker.Chunk(tt.data)

			// Every chunk must respect the size limit; keep going after a
			// failure so all oversized chunks are reported in one run.
			for i, chunk := range got {
				if len(chunk) > tt.maxSize {
					t.Errorf("Chunker.Chunk() chunk %d has length %d, want at most %d: %q", i, len(chunk), tt.maxSize, chunk)
				}
			}

			if len(got) != tt.wantChunks {
				t.Errorf("Chunker.Chunk() returned %d chunks, want %d", len(got), tt.wantChunks)
			}
		})
	}
}
// Test_findFirstSeparator is a table-driven check that findFirstSeparator,
// called with a chunk and a list of separators, returns the expected integer.
//
// NOTE(review): in the single case below the first space of the chunk is at
// index 7 and the expected result is 8 — presumably the function reports the
// position just past the separator; confirm against findFirstSeparator.
func Test_findFirstSeparator(t *testing.T) {
	cases := []struct {
		name       string
		chunk      string
		separators []string
		want       int
	}{
		{
			name:       "Test find first separator",
			chunk:      "Testing the logic of findFirstSeparator",
			separators: DefaultSeparators,
			want:       8,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := findFirstSeparator(tc.chunk, tc.separators)
			if got == tc.want {
				return
			}
			t.Errorf("findFirstSeparator() = %v, want %v", got, tc.want)
		})
	}
}
// Test_findLastSeparator is a table-driven check of the first value returned
// by findLastSeparator when called with a chunk, a list of separators and a
// third argument of 0. The second return value is not inspected.
//
// NOTE(review): in the single case below the last space of the chunk is at
// index 20, which is also the expected result; the meaning of the third
// argument and of the discarded return value is not visible here — confirm
// against findLastSeparator.
func Test_findLastSeparator(t *testing.T) {
	cases := []struct {
		name       string
		chunk      string
		separators []string
		want       int
	}{
		{
			name:       "Test find last separator",
			chunk:      "Testing the logic of findLastSeparator",
			separators: DefaultSeparators,
			want:       20,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, _ := findLastSeparator(tc.chunk, tc.separators, 0)
			if got == tc.want {
				return
			}
			t.Errorf("findLastSeparator() = %v, want %v", got, tc.want)
		})
	}
}
// BenchmarkChunk_Example1KB measures Chunker.Chunk on a 1 KiB synthetic input
// with a chunker built by NewChunker(256, 32, DefaultSeparators, true, false).
//
// The input repeats the 27-byte cycle "a".."z" followed by "\n". It is
// converted to a string once, before the sub-benchmark starts, so the timed
// loop contains only the Chunk call and not a per-iteration copy of the input.
// SetBytes makes the benchmark also report throughput.
func BenchmarkChunk_Example1KB(b *testing.B) {
	const alphabet = "abcdefghijklmnopqrstuvwxyz\n"

	size := int64(1024)
	content := make([]byte, size)
	for i := range content {
		content[i] = alphabet[i%len(alphabet)]
	}
	input := string(content)

	chunker := NewChunker(256, 32, DefaultSeparators, true, false)
	b.Run(fmt.Sprintf("input_size_%d(%d/%d)", len(content), 256, 32), func(b *testing.B) {
		b.SetBytes(size)
		for i := 0; i < b.N; i++ {
			chunker.Chunk(input)
		}
	})
}
// BenchmarkChunk_Example1MB measures Chunker.Chunk on a 1 MiB synthetic input
// with a chunker built by NewChunker(512, 64, DefaultSeparators, true, false).
//
// The input repeats the 27-byte cycle "a".."z" followed by "\n". It is
// converted to a string once, before the sub-benchmark starts, so the timed
// loop contains only the Chunk call and not a per-iteration 1 MiB copy of the
// input. SetBytes makes the benchmark also report throughput.
func BenchmarkChunk_Example1MB(b *testing.B) {
	const alphabet = "abcdefghijklmnopqrstuvwxyz\n"

	size := int64(1024 * 1024)
	content := make([]byte, size)
	for i := range content {
		content[i] = alphabet[i%len(alphabet)]
	}
	input := string(content)

	chunker := NewChunker(512, 64, DefaultSeparators, true, false)
	b.Run(fmt.Sprintf("input_size_%d(%d/%d)", len(content), 512, 64), func(b *testing.B) {
		b.SetBytes(size)
		for i := 0; i < b.N; i++ {
			chunker.Chunk(input)
		}
	})
}
// BenchmarkChunk_Example5MB measures Chunker.Chunk on a 5 MiB synthetic input
// with a chunker built by NewChunker(512, 64, DefaultSeparators, true, false).
//
// The input repeats the 27-byte cycle "a".."z" followed by "\n". It is
// converted to a string once, before the sub-benchmark starts, so the timed
// loop contains only the Chunk call and not a per-iteration 5 MiB copy of the
// input. SetBytes makes the benchmark also report throughput.
func BenchmarkChunk_Example5MB(b *testing.B) {
	const alphabet = "abcdefghijklmnopqrstuvwxyz\n"

	size := int64(5 * 1024 * 1024)
	content := make([]byte, size)
	for i := range content {
		content[i] = alphabet[i%len(alphabet)]
	}
	input := string(content)

	chunker := NewChunker(512, 64, DefaultSeparators, true, false)
	b.Run(fmt.Sprintf("input_size_%d(%d/%d)", len(content), 512, 64), func(b *testing.B) {
		b.SetBytes(size)
		for i := 0; i < b.N; i++ {
			chunker.Chunk(input)
		}
	})
}
// BenchmarkChunk_Example10MB measures Chunker.Chunk on a 10 MiB synthetic
// input with a chunker built by
// NewChunker(1024, 128, DefaultSeparators, true, false).
//
// The input repeats the 27-byte cycle "a".."z" followed by "\n". It is
// converted to a string once, before the sub-benchmark starts, so the timed
// loop contains only the Chunk call and not a per-iteration 10 MiB copy of the
// input. SetBytes makes the benchmark also report throughput.
func BenchmarkChunk_Example10MB(b *testing.B) {
	const alphabet = "abcdefghijklmnopqrstuvwxyz\n"

	size := int64(10 * 1024 * 1024)
	content := make([]byte, size)
	for i := range content {
		content[i] = alphabet[i%len(alphabet)]
	}
	input := string(content)

	chunker := NewChunker(1024, 128, DefaultSeparators, true, false)
	b.Run(fmt.Sprintf("input_size_%d(%d/%d)", len(content), 1024, 128), func(b *testing.B) {
		b.SetBytes(size)
		for i := 0; i < b.N; i++ {
			chunker.Chunk(input)
		}
	})
}
// TestChunkSentences is a table-driven check that ChunkSentences splits its
// input into the expected list of sentences, compared with reflect.DeepEqual.
//
// The single case spans two lines of text and expects six sentences; the
// titles "Mrs." and "Mr." are expected to stay inside one sentence rather than
// ending it, and sentences ending in "?" are expected to be split as well.
func TestChunkSentences(t *testing.T) {
	cases := []struct {
		name string
		data string
		want []string
	}{
		{
			name: "Test chunk sentences",
			data: `This is a test string. It is used to test the chunker. It is a very simple chunker.
Mrs. Jones and Mr. Brown. What is this? I don't know.`,
			want: []string{
				"This is a test string.",
				"It is used to test the chunker.",
				"It is a very simple chunker.",
				"Mrs. Jones and Mr. Brown.",
				"What is this?",
				"I don't know.",
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := ChunkSentences(tc.data)
			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Errorf("ChunkSentences() = %+v, want %+v", got, tc.want)
		})
	}
}