import argparse
import logging
import math
import os
import shutil
import sys
from collections import Counter, defaultdict
from pathlib import Path

import pandas as pd
from tqdm import tqdm

# Set up logger
logging.basicConfig(level=logging.INFO)
def read_label_files(folder: Path, crops: pd.DataFrame):
"""
Read labels from files and return a dictionary with classes and counts per file.
Returns a dictionary with class names as keys and a dictionary with filenames and counts as values.
"""
    rows = []
    classes = defaultdict(lambda: defaultdict(int))
    label_files = [
        f for f in folder.iterdir() if f.suffix == ".txt" and "_classes" not in f.stem
    ]
    for file in tqdm(label_files, desc="Reading files"):
        classes_in_file = []
        with file.open() as f:
            for line in f:
                class_name = line.strip().split()[0]
                classes_in_file.append(class_name)
                # Crop files are named <paintingName>_<idx>: count per painting
                classes[class_name]["_".join(file.stem.split("_")[:-1])] += 1
        rows.append({"filename": file.stem, "classes": classes_in_file})
    # Build the dataframe once instead of concatenating a row at a time
    filename_classes = pd.DataFrame(rows, columns=["filename", "classes"])
    return classes, crops.merge(filename_classes, on="filename")
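
# A sketch of the structures read_label_files returns (the class ids and
# painting names below are hypothetical, not from the dataset):
#
#   classes["0"]["painting1"] == 3          # class "0" occurs 3 times in painting1's crops
#   merged.loc[0, "classes"] == ["0", "2"]  # classes found in crop "painting1_0"
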
def read_crops_location(file: Path):
"""
Read the crop location from a file.
The file should contain one line per crop, with the following format:
<filename> <x_min> <y_min> <x_max> <y_max>
Returns a dataframe with the filename and crop location, and the largest dimension of the all the crops.
"""
df = pd.DataFrame(columns=["filename", "paintingName", "location"])
max_dimension = 0
with file.open() as f:
curr_painting = ""
for line in f:
line = line.strip().split()
painting = line[0]
idx = 0 if painting != curr_painting else idx + 1
curr_painting = painting
filename = f"{painting}_{idx}"
x_min, y_min, x_max, y_max = [int(x) for x in line[1:]]
dimension = max(x_max - x_min, y_max - y_min)
max_dimension = max(max_dimension, dimension)
df = pd.concat([
df,
pd.DataFrame({
"filename": [filename],
"paintingName": painting,
"location": [line[1:]],
}),
])
return df, max_dimension
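
# Example crops.txt content and the rows it produces (hypothetical values):
#
#   painting1 0 0 1080 1080        -> filename "painting1_0"
#   painting1 1080 0 2160 1080     -> filename "painting1_1"
#   painting2 500 500 1400 1580    -> filename "painting2_0"
#
# max_dimension here would be 1080 (painting2's crop is 900 x 1080).
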
def read_painting_sizes(file: Path):
"""
Read the painting sizes from a file.
The file should contain one line per painting, with the following format:
<paintingName> <width> <height>
    Returns a dataframe with the paintingName, width, and height of each painting.
    """
    rows = []
    with file.open() as f:
        for line in f:
            line = line.strip().split()
            width, height = [int(x) for x in line[1:]]
            rows.append({"paintingName": line[0], "width": width, "height": height})
    # Build the dataframe once instead of concatenating a row at a time
    return pd.DataFrame(rows, columns=["paintingName", "width", "height"])
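
# Example painting_sizes.txt line (hypothetical): "painting1 21000 9800"
# becomes the row {"paintingName": "painting1", "width": 21000, "height": 9800}.
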
def move_files(
files: pd.DataFrame,
label_dir: Path,
image_dir: Path,
label_dest_dir: Path,
image_dest_dir: Path,
name="files",
):
"""Move files mathcing the "filename" column in the files dataframe from the label_dir and image_dir to the label_dest_dir and image_dest_dir respectively."""
for _, row in tqdm(files.iterrows(), desc=f"Moving {name}", total=len(files)):
shutil.move(
label_dir / f"{row['filename']}.txt",
label_dest_dir / f"{row['filename']}.txt",
)
shutil.move(
image_dir / f"{row['filename']}.jpg",
image_dest_dir / f"{row['filename']}.jpg",
)
def max_cells(length, cell_min_size, gutter_size):
"""
Calculate the maximum number of cells of size >= cell_min_size that can fit in a given length, with a fixed gutter size between the cells.
"""
return math.floor((length + gutter_size) / (cell_min_size + gutter_size))
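
# A worked example of max_cells (numbers are hypothetical): with n cells of
# size c and a gutter g between them, n*c + (n-1)*g <= length, which gives
# n <= (length + g) / (c + g). For a 10000 px painting, cell_min_size 2160
# and gutter_size 1080: floor(11080 / 3240) == 3 cells.
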
def separate_crops_into_cells(paintings, crops, crop_size=1080):
    """
    Assign each crop to a grid cell of its painting, based on the crop's
    center point. Cells are separated by gutters of one crop size so that
    crops whose centers lie in different cells are well separated.
    """
    gutter_size = crop_size  # The fixed size of the gutter between the grid cells
    min_cell_size = 2 * crop_size  # The minimum size of a grid cell
paintings["n_cells_width"] = paintings["width"].apply(
lambda x: max_cells(x, min_cell_size, gutter_size)
)
paintings["n_cells_height"] = paintings["height"].apply(
lambda x: max_cells(x, min_cell_size, gutter_size)
)
    def get_grid_cell(row):
        # .iloc[0] turns the single matching row into scalars; on recent pandas,
        # the arithmetic below would otherwise produce one-element Series that
        # math.floor cannot handle.
        painting = paintings.loc[
            paintings["paintingName"] == row["paintingName"]
        ].iloc[0]
        # A cell's size is what remains after the gutters, divided evenly
        cell_width = (
            painting["width"] - (painting["n_cells_width"] - 1) * gutter_size
        ) / painting["n_cells_width"]
        cell_height = (
            painting["height"] - (painting["n_cells_height"] - 1) * gutter_size
        ) / painting["n_cells_height"]
x_min, y_min, x_max, y_max = [int(x) for x in row["location"]]
center_x, center_y = (x_min + x_max) / 2, (y_min + y_max) / 2
# Determine the grid cell coordinates
grid_x = math.floor(center_x / (cell_width + gutter_size))
grid_y = math.floor(center_y / (cell_height + gutter_size))
return (grid_x, grid_y)
# Apply the grid cell identification function to each crop
crops["grid_cell"] = crops.apply(get_grid_cell, axis=1)
return crops
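
# A worked example of the grid assignment (hypothetical sizes): a 10000 px
# wide painting with crop_size 1080 gets n_cells_width == 3, so
# cell_width == (10000 - 2 * 1080) / 3 ~= 2613.3. A crop centered at
# x == 4000 lands in grid_x == floor(4000 / (2613.3 + 1080)) == 1.
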
def count_classes(df):
    """Count the occurrences of each class over the "classes" column of df."""
    color_count = Counter()
    for marble_list in df["classes"]:
        color_count += Counter(marble_list)
    return color_count
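
# For instance (hypothetical class lists), a dataframe whose "classes" column
# holds [["red", "red"], ["blue"]] yields Counter({"red": 2, "blue": 1}).
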
def counter_diff(counter1, counter2):
    """Total absolute difference between two Counters (their L1 distance)."""
    diff = 0
    for key in set(counter1.keys()).union(set(counter2.keys())):
        diff += abs(counter1[key] - counter2[key])
    return diff
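
# e.g. counter_diff(Counter(a=3, b=1), Counter(a=1, c=2))
#      == |3 - 1| + |1 - 0| + |0 - 2| == 5
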
def assign_train_val_sets(
df, train_percentage, val_percentage, capacity_constraint_multiplier
):
"""
Assign each crop to either the train or validation set, while trying to keep the number of crops of each color in each set as close as possible.
capacity_constraint_multiplier: higher value means more weight on capacity constraint
"""
total_crops = len(df.index)
train_capacity = total_crops * train_percentage
val_capacity = total_crops * val_percentage
df["set"] = None
for paintingName in df["paintingName"].unique():
for grid_cell in df[df["paintingName"] == paintingName]["grid_cell"].unique():
bucket_df = df[
(df["paintingName"] == paintingName) & (df["grid_cell"] == grid_cell)
]
bucket_color_count = count_classes(bucket_df)
set_color_counts = [
count_classes(df[df["set"] == "train"]),
count_classes(df[df["set"] == "val"]),
]
color_diffs = [
counter_diff(set_color_counts[0], bucket_color_count),
counter_diff(set_color_counts[1], bucket_color_count),
]
# Increase the weight of color difference if the set exceeds capacity
if len(df[df["set"] == "train"]) > train_capacity:
color_diffs[0] *= capacity_constraint_multiplier
if len(df[df["set"] == "val"]) > val_capacity:
color_diffs[1] *= capacity_constraint_multiplier
if color_diffs[0] <= color_diffs[1]:
df.loc[bucket_df.index, "set"] = "train"
else:
df.loc[bucket_df.index, "set"] = "val"
def divide_train_val_sets(
df, train_percentage=0.8, val_percentage=0.2, capacity_constraint_multiplier=3
):
    """Split the crops dataframe and return a [train, validation] pair of dataframes."""
assign_train_val_sets(
df, train_percentage, val_percentage, capacity_constraint_multiplier
)
return [df[df["set"] == "train"], df[df["set"] == "val"]]
def print_stats(train_files, validation_files, len_both):
    def class_frequencies(files):
        # Per-class counts normalized by the number of crops, sorted descending
        counts = count_classes(files)
        return dict(
            sorted(
                ((key, count / len(files)) for key, count in counts.items()),
                key=lambda item: item[1],
                reverse=True,
            )
        )

    num_classes_train = class_frequencies(train_files)
    num_classes_val = class_frequencies(validation_files)
logging.info("Train and Validaton set class distribution:")
logging.info("only in train set:")
for key in num_classes_train:
if key not in num_classes_val:
logging.info(f"{key}: {num_classes_train[key]}")
logging.info("only in validation set:")
for key in num_classes_val:
if key not in num_classes_train:
logging.info(f"{key}: {num_classes_val[key]}")
logging.info("in both sets:")
distribution_df = pd.DataFrame(columns=["train", "valid"])
for key in num_classes_train:
if key in num_classes_val:
distribution_df.loc[key] = [num_classes_train[key], num_classes_val[key]]
logging.info(distribution_df)
# Print the actual split percentage
    logging.info(
        f"Train set size: {len(train_files.index)} "
        f"({len(train_files.index) / len_both * 100:.1f}% of all crops, excluding test painting crops)"
    )
    logging.info(
        f"Validation set size: {len(validation_files.index)} "
        f"({len(validation_files.index) / len_both * 100:.1f}% of all crops, excluding test painting crops)"
    )
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("location", help="Location of the dataset")
parser.add_argument(
"--output", "-o", help="Location to save the separated dataset", default=None
)
parser.add_argument(
"--force", "-f", help="Force overwrite of existing files", action="store_true"
)
return parser.parse_args()
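
# Typical invocation (paths are hypothetical):
#
#   python separate_crops.py /data/sword-simp --output /data/sword-simp/separated -f
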
def print_welcome(args):
padding = 140
print("\n\n")
print(" SWORD-SIMP Dataset Separator ".center(padding, "8"))
print(f" Dataset location: {args.location} ".center(padding))
print(f" Output location: {args.output} ".center(padding))
print(f" Force overwrite: {args.force} ".center(padding))
print("".center(padding, "8"))
print("\n\n")
def main():
args = get_args()
print_welcome(args)
dataset_dir = Path(args.location)
output_dir = Path(args.output) if args.output else dataset_dir / "separated"
# paths to the files/directories
CROPS_LOCATION_FILE = dataset_dir / "cropped" / "crops.txt"
PAINTING_SIZES_FILE = dataset_dir / "painting_sizes.txt"
CLASSES_FILE = dataset_dir / "classes.txt"
OUT_YAML_FILE = output_dir / "dataset.yaml"
TEST_PAINTING_FILE = dataset_dir / "test_painting.txt"
LABELS_DIR = dataset_dir / "cropped" / "labels"
IMAGES_DIR = dataset_dir / "cropped" / "images"
TEST_IMAGES_DIR = dataset_dir / "images"
TEST_LABELS_DIR = dataset_dir / "labels"
OUTPUT_DIRS = {}
# Create the output directories
for split in ["train", "valid", "test"]:
for folder in ["images", "labels"]:
path = output_dir / split / folder
if path.exists():
if not args.force:
logging.error(
f"Output directory {path} already exists. Use -f to force overwrite."
)
                    sys.exit(1)
logging.warning(f"Output directory {path} already exists. Overwriting.")
shutil.rmtree(path)
os.makedirs(path, exist_ok=True)
OUTPUT_DIRS[split + "_" + folder] = path
# Read test painting file
with open(TEST_PAINTING_FILE, "r") as file:
test_painting = file.readline().strip()
# Read the crops location file and the label files
crops, max_crop_dimension = read_crops_location(CROPS_LOCATION_FILE)
painting_classes, crops = read_label_files(LABELS_DIR, crops)
paintings = read_painting_sizes(PAINTING_SIZES_FILE)
# Get the ordered list of classes
with open(CLASSES_FILE, "r") as file:
classes = [line.strip() for line in file]
# Separate the crops into cells
crops = separate_crops_into_cells(paintings, crops, max_crop_dimension)
# TODO Visualize the cells
len_both = len(crops.index) # used for printing stats
# Separate the crops into train and validation sets
train_files, validation_files = divide_train_val_sets(crops)
# Print stats
print_stats(train_files, validation_files, len_both)
# TODO Visualize the train and validation sets
move_files(
train_files,
LABELS_DIR,
IMAGES_DIR,
OUTPUT_DIRS["train_labels"],
OUTPUT_DIRS["train_images"],
"train set",
)
move_files(
validation_files,
LABELS_DIR,
IMAGES_DIR,
OUTPUT_DIRS["valid_labels"],
OUTPUT_DIRS["valid_images"],
"validation set",
)
# Copy the test image psb into the test set
shutil.copy(
TEST_IMAGES_DIR / f"{test_painting}.psb",
OUTPUT_DIRS["test_images"] / f"{test_painting}.psb",
)
shutil.copy(
TEST_LABELS_DIR / f"{test_painting}.txt",
OUTPUT_DIRS["test_labels"] / f"{test_painting}.txt",
)
# save yaml file in the yolov5 format
with open(OUT_YAML_FILE, "w") as file:
file.write(f"train: {OUTPUT_DIRS['train_images'].relative_to(output_dir)}\n")
file.write(f"val: {OUTPUT_DIRS['valid_images'].relative_to(output_dir)}\n")
file.write(f"test: {OUTPUT_DIRS['test_images'].relative_to(output_dir)}\n")
file.write(f"nc: {len(classes)}\n")
file.write("names:\n")
for c in classes:
file.write(f"- {c}\n")
if __name__ == "__main__":
main()