-
Notifications
You must be signed in to change notification settings - Fork 4
/
model.py
61 lines (48 loc) · 2 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import torch
from transformers import AutoModelForPreTraining
class BaseRecognizer(torch.nn.Module):
    """Shared skeleton for recognizers: an encoder followed by a linear head.

    This class owns only the classification head. Subclasses provide the
    encoder by overriding ``_get_features`` and by supplying
    ``get_feat_length``.
    """

    def __init__(self, model, feature_size, vocab_size):
        """Build the linear output head.

        Args:
            model: Model identifier. Unused by the base class.
            feature_size: Input width of the head, i.e. the size of the last
                dimension of whatever ``_get_features`` returns.
            vocab_size: Output width of the head.
        """
        super().__init__()
        self.head = torch.nn.Linear(feature_size, vocab_size)

    def _get_features(self, input_values, attention_mask=None):
        """Encode raw inputs into features; must be overridden.

        ``attention_mask`` is optional because ``forward`` invokes this hook
        with the inputs only. Without the default, calling the base class
        failed with ``TypeError`` rather than the intended
        ``NotImplementedError``.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError

    def get_feat_length(self, input_length):
        """Map an input length to a feature length; must be overridden.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError

    def forward(self, inputs):
        """Run the encoder and the head.

        Args:
            inputs: Passed unchanged to ``_get_features``.

        Returns:
            A ``(logits, features)`` tuple, where ``features`` is the encoder
            output and ``logits`` is the head applied to it.
        """
        features = self._get_features(inputs)
        logits = self.head(features)
        return logits, features
class Wav2Vec2Recognizer(BaseRecognizer):
    """Recognizer that runs a full pretrained wav2vec 2.0 network as encoder."""

    # Head input width per supported checkpoint name.
    # NOTE(review): each value must equal the last dimension of the first
    # element of the network's output; verify 1024 for xls-r-300m against
    # that checkpoint's configuration.
    _FEATURE_SIZES = {
        "facebook/wav2vec2-base": 256,
        "facebook/wav2vec2-xls-r-300m": 1024,
    }

    def __init__(self, model, vocab_size):
        """Load the pretrained network and build the head.

        Args:
            model: Checkpoint name; must be a key of the size table.
            vocab_size: Output width of the linear head.

        Raises:
            KeyError: If ``model`` is not one of the supported names.
        """
        super().__init__(model, self._FEATURE_SIZES[model], vocab_size)
        self.net = AutoModelForPreTraining.from_pretrained(model)
        # Reuse the network's own length helper instead of re-deriving it.
        self.get_feat_length = self.net._get_feat_extract_output_lengths

    def _get_features(self, inputs):
        """Return the first element of the network's output for ``inputs``.

        Presumably this is the projected hidden state of the pretraining
        model; TODO confirm for the transformers version in use.
        """
        return self.net(inputs)[0]

    def freeze_conv_features(self):
        """Freeze the feature encoder via the network's own helper."""
        self.net.freeze_feature_encoder()
class Wav2Vec2ConvRecognizer(BaseRecognizer):
    """Recognizer that uses only the front end of a pretrained wav2vec 2.0.

    Only the feature extractor and the feature projection of the loaded
    network are registered as submodules.
    """

    # Head input width per supported checkpoint name.
    _FEATURE_SIZES = {
        "facebook/wav2vec2-base": 512,
        "facebook/wav2vec2-xls-r-300m": 512,
    }

    def __init__(self, model, vocab_size):
        """Load the checkpoint, keep its front end and build the head.

        Args:
            model: Checkpoint name; must be a key of the size table.
            vocab_size: Output width of the linear head.

        Raises:
            KeyError: If ``model`` is not one of the supported names.
        """
        super().__init__(model, self._FEATURE_SIZES[model], vocab_size)
        pretrained = AutoModelForPreTraining.from_pretrained(model)
        backbone = pretrained.wav2vec2
        self.extractor = backbone.feature_extractor
        self.projector = backbone.feature_projection
        # The bound method keeps a reference to the whole loaded network,
        # even though only the two submodules above are registered here.
        self.get_feat_length = pretrained._get_feat_extract_output_lengths

    def freeze_conv_features(self):
        """Disable gradients for every extractor and projector parameter."""
        for frozen in (self.extractor, self.projector):
            frozen.requires_grad_(False)

    def _get_features(self, inputs):
        """Return the second output of the projector for ``inputs``.

        The extractor output has dimensions 1 and 2 swapped before it is
        handed to the projector (presumably channels-first to channels-last;
        TODO confirm). The projector's first return value is discarded.
        """
        conv_out = self.extractor(inputs)
        _, feats = self.projector(conv_out.transpose(1, 2))
        return feats