forked from BR-IDL/PaddleViT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
225 lines (188 loc) · 7.58 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# Copyright (c) 2021 PPViT Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities"""
import copy
import math
import pickle

import numpy as np
import paddle
import paddle.distributed as dist
from paddle.optimizer.lr import LRScheduler
class AverageMeter():
    """Running weighted-average meter, e.g. for monitoring losses.

    Attributes:
        avg: weighted average of everything passed to update() since the last reset
        sum: weighted sum of the values (each value multiplied by its count)
        cnt: total count accumulated so far
    """
    def __init__(self):
        # reset() is the single place that defines the initial state
        self.reset()

    def reset(self):
        """reset all values to zeros"""
        self.avg = 0
        self.sum = 0
        self.cnt = 0

    def update(self, val, n=1):
        """update avg by val and n, where val is the avg of n values

        Args:
            val: average of the n new values
            n: number of values that val summarizes, default: 1
        """
        self.sum += val * n
        self.cnt += n
        # cnt is still 0 when the first update carries n=0; report an average
        # of 0 (the reset state) instead of raising ZeroDivisionError
        self.avg = self.sum / self.cnt if self.cnt else 0
def _max_by_axis(the_list):
    """Return the position-wise maximum over a list of lists.

    Args:
        the_list: non-empty list of lists (used in this file for tensor shapes);
            no sublist may be longer than the first one
    Returns:
        maxes: new list, maxes[idx] is the largest idx-th item over all sublists
    """
    # copy the first sublist so the caller's data is not overwritten in place
    maxes = list(the_list[0])
    for sublist in the_list[1:]:
        for idx, item in enumerate(sublist):
            maxes[idx] = max(maxes[idx], item)
    return maxes
class NestedTensor():
    """Container that keeps a data tensor together with its mask.

    Attributes:
        tensors: the data object passed at construction
        mask: the mask object passed at construction
    """
    def __init__(self, tensors, mask):
        self.tensors, self.mask = tensors, mask

    def decompose(self):
        """return the stored pair as (tensors, mask)"""
        return (self.tensors, self.mask)

    def __repr__(self):
        """the representation is the string form of .tensors only"""
        text = str(self.tensors)
        return text
def nested_tensor_from_tensor_list(tensor_list, size_divisibility):
    """Batch images of different sizes into one zero-padded NestedTensor.

    The per-axis maximum over all image shapes is used as the common sample
    shape. When size_divisibility > 1, axes 1 and 2 of that shape are rounded
    up to the next multiple of size_divisibility. Each image is written into
    the top-left corner of its slot, so the padding sits at the bottom-right.

    Args:
        tensor_list: list of image tensors, each indexed here along 3 axes
        size_divisibility: when > 1, axes 1 and 2 are padded to a multiple of it
    Returns:
        NestedTensor holding the [b, c, h, w] batch and an int32 mask of shape
        [b, h, w] that is 0 where image data was written and 1 on padding
    """
    shapes = [list(img.shape) for img in tensor_list]
    max_size = _max_by_axis(shapes)
    if size_divisibility > 1:
        stride = size_divisibility
        for axis in (1, 2):
            max_size[axis] = (max_size[axis] + (stride - 1)) // stride * stride

    # leading entry is the number of images in this batch
    batch_shape = [len(tensor_list)] + max_size
    b, _, h, w = batch_shape
    data_tensor = paddle.zeros(batch_shape, dtype=tensor_list[0].dtype)
    mask = paddle.ones((b, h, w), dtype='int32')

    # Fill sample by sample through explicit indexing: per the original note,
    # iterating the batch with zip yields new tensors in paddle, so writes
    # into the zipped items would not reach data_tensor / mask.
    for idx, img in enumerate(tensor_list):
        dims = img.shape
        data_tensor[idx, : dims[0], : dims[1], : dims[2]] = img
        mask[idx, : dims[1], : dims[2]] = 0
    return NestedTensor(data_tensor, mask)
def reduce_dict(input_dict, average=True):
    """All-reduce the tensors of a dict across ranks.

    Args:
        input_dict: dict mapping names to tensors that can be stacked together
        average: if True the reduced values are divided by the world size,
            default: True
    Returns:
        input_dict itself when fewer than 2 ranks are running, otherwise a new
        dict with the same keys mapped to the reduced values
    """
    world_size = dist.get_world_size()
    if world_size < 2:
        return input_dict
    with paddle.no_grad():
        # sorted keys give every rank the same stacking order
        names = sorted(input_dict.keys())
        stacked = paddle.stack([input_dict[name] for name in names], axis=0)
        dist.all_reduce(stacked)
        if average:
            stacked /= world_size
        reduced_dict = dict(zip(names, stacked))
    return reduced_dict
@paddle.no_grad()
def accuracy(output, target, topk=(1,)):
    """Compute the top-k accuracy, in percent, for every k in topk.

    Args:
        output: prediction scores; the top-k search runs along axis 1
        target: ground-truth class indices, one per sample along axis 0
        topk: tuple of k values to evaluate, default: (1,)
    Returns:
        res: list with one tensor per k holding the percentage of samples whose
            target is among the k highest-scoring classes; a single zero tensor
            when target is empty
    """
    if target.numel() == 0:
        return [paddle.zeros([])]
    maxk = max(topk)
    # Tensor.size is a property in paddle (not a method), so read shape instead
    batch_size = target.shape[0]
    _, pred = paddle.topk(output, k=maxk, axis=1, largest=True, sorted=True)
    # [batch, maxk] -> [maxk, batch] so that row i holds the (i+1)-th best guess
    pred = paddle.transpose(pred, perm=[1, 0])
    # paddle.reshape takes the shape as a list; align dtype with the indices
    labels = target.reshape([1, -1]).astype(pred.dtype).expand_as(pred)
    correct = paddle.equal(pred, labels)
    res = []
    for k in topk:
        correct_k = correct[:k].reshape([-1]).astype('float32').sum(0)
        res.append(correct_k * (100.0 / batch_size))
    return res
class WarmupCosineScheduler(LRScheduler):
    """Warmup Cosine Scheduler

    First apply linear warmup, then apply cosine decay schedule.
    Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs"
    Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining
    "total_epochs - warmup_epochs"

    Attributes:
        learning_rate: the starting learning rate (without warmup), not used here!
        warmup_start_lr: warmup starting learning rate
        start_lr: the starting learning rate (without warmup)
        end_lr: the ending learning rate after whole loop
        warmup_epochs: # of epochs for warmup
        total_epochs: # of total epochs (include warmup)
        cycles: # of cosine periods covered by the decay phase; the default 0.5
            is one half-wave, i.e. a monotonic decay from start_lr to end_lr
    """
    def __init__(self,
                 learning_rate,
                 warmup_start_lr,
                 start_lr,
                 end_lr,
                 warmup_epochs,
                 total_epochs,
                 cycles=0.5,
                 last_epoch=-1,
                 verbose=False):
        """init WarmupCosineScheduler """
        # set the schedule parameters before calling the base constructor, so
        # they already exist if the base class queries get_lr() during init
        self.warmup_epochs = warmup_epochs
        self.total_epochs = total_epochs
        self.warmup_start_lr = warmup_start_lr
        self.start_lr = start_lr
        self.end_lr = end_lr
        self.cycles = cycles
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        """ return lr value for the current self.last_epoch"""
        if self.last_epoch < self.warmup_epochs:
            # linear interpolation between warmup_start_lr and start_lr
            val = (self.start_lr - self.warmup_start_lr) * float(
                self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr
            return val

        # fraction of the decay phase already done; max(1, ...) avoids a
        # division by zero when total_epochs == warmup_epochs
        progress = float(self.last_epoch - self.warmup_epochs) / float(
            max(1, self.total_epochs - self.warmup_epochs))
        # cosine factor in [0, 1]; relies on the module-level `import math`
        val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
        # scale the factor into [end_lr, start_lr]
        val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr)
        return val
def all_gather(data):
    """ run all_gather on any picklable data (do not requires tensors)

    The object is pickled, its bytes are shipped as an int32 tensor that is
    padded to the longest payload of all ranks, and each received payload is
    cut back to its real length and unpickled.

    Args:
        data: picklable object
    Returns:
        data_list: list of data gathered from each rank
    """
    world_size = dist.get_world_size()
    if world_size == 1:
        return [data]

    # write data into bytes and view them as unsigned values, so that every
    # element is in 0..255 and survives the int32 -> uint8 cast below exactly
    buffer = pickle.dumps(data)
    np_buffer = np.frombuffer(buffer, dtype=np.uint8)
    # int32 is used as carrier: uint8 does not have many ops in paddle
    tensor = paddle.to_tensor(np_buffer, dtype='int32')

    # obtain the payload length of each rank as plain Python ints, so that
    # max(), shape arithmetic and slicing below do not operate on tensors
    local_size = tensor.shape[0]
    gathered_sizes = []
    dist.all_gather(gathered_sizes, paddle.to_tensor([local_size]))
    size_list = [int(size.numpy().item()) for size in gathered_sizes]
    max_size = max(size_list)

    # receiving tensors from all ranks,
    # all_gather does not support different shape, so we use padding
    if local_size != max_size:
        # zeros instead of uninitialized memory; the pad is sliced off again
        padding = paddle.zeros(shape=[max_size - local_size], dtype='int32')
        tensor = paddle.concat((tensor, padding), axis=0)
    tensor_list = []
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, gathered in zip(size_list, tensor_list):
        payload = gathered.astype('uint8').cpu().numpy().tobytes()[:size]
        # NOTE: pickle.loads can execute arbitrary code; this is only
        # acceptable because the payload comes from ranks of the same job
        data_list.append(pickle.loads(payload))
    return data_list