Skip to content

Commit

Permalink
Remove Hard-coded hann window
Browse files: browse the repository at this point in the history
  • Loading branch information
bobqianic authored Aug 13, 2023
1 parent 0a5f435 commit 65fd0e1
Showing 1 changed file with 11 additions and 107 deletions.
118 changes: 11 additions & 107 deletions whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2361,8 +2361,8 @@ static void dft(const std::vector<float> & in, std::vector<float> & out) {

for (int n = 0; n < N; n++) {
float angle = 2*M_PI*k*n/N;
re += in[n]*cos(angle);
im -= in[n]*sin(angle);
re += in[n]*cosf(angle);
im -= in[n]*sinf(angle);
}

out[k*2 + 0] = re;
Expand Down Expand Up @@ -2413,8 +2413,8 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) {
for (int k = 0; k < N/2; k++) {
float theta = 2*M_PI*k/N;

float re = cos(theta);
float im = -sin(theta);
float re = cosf(theta);
float im = -sinf(theta);

float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2*k + 1];
Expand Down Expand Up @@ -2506,109 +2506,13 @@ static bool log_mel_spectrogram(
whisper_mel & mel) {
const int64_t t_start_us = ggml_time_us();

// Hanning window (Hard-coded to eliminate difference)
// Hanning window (Use cosf to eliminate difference)
// ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
std::vector<float> hann = {0.0, 6.16908073425293e-05, 0.0002467334270477295, 0.0005550682544708252,
0.000986635684967041, 0.0015413463115692139, 0.0022190213203430176, 0.0030195116996765137,
0.003942638635635376, 0.004988163709640503, 0.006155818700790405, 0.007445335388183594,
0.008856385946273804, 0.010388582944869995, 0.012041628360748291, 0.013815045356750488,
0.01570841670036316, 0.01772129535675049, 0.019853144884109497, 0.022103488445281982,
0.02447172999382019, 0.026957333087921143, 0.029559612274169922, 0.03227800130844116,
0.03511175513267517, 0.03806024789810181, 0.0411226749420166, 0.044298380613327026,
0.04758647084236145, 0.05098623037338257, 0.05449673533439636, 0.058117181062698364,
0.06184667348861694, 0.0656842589378357, 0.06962898373603821, 0.07367992401123047,
0.0778360664844513, 0.08209633827209473, 0.08645972609519958, 0.09092515707015991,
0.09549149870872498, 0.10015767812728882, 0.10492250323295593, 0.1097848117351532,
0.11474338173866272, 0.11979702115058899, 0.12494447827339172, 0.13018447160720825,
0.1355157196521759, 0.14093685150146484, 0.1464466154575348, 0.15204361081123352,
0.1577264666557312, 0.16349375247955322, 0.16934409737586975, 0.1752760112285614,
0.18128803372383118, 0.18737870454788208, 0.19354650378227234, 0.1997898817062378,
0.20610737800598145, 0.21249738335609436, 0.21895831823349, 0.2254886031150818,
0.23208662867546082, 0.23875075578689575, 0.24547931551933289, 0.2522706985473633,
0.25912320613861084, 0.26603513956069946, 0.27300477027893066, 0.2800304591655731,
0.2871103882789612, 0.29424285888671875, 0.30142611265182495, 0.30865830183029175,
0.31593772768974304, 0.3232625722885132, 0.3306310474872589, 0.3380413055419922,
0.34549152851104736, 0.352979838848114, 0.3605044484138489, 0.3680635094642639,
0.37565508484840393, 0.38327735662460327, 0.3909284174442291, 0.39860638976097107,
0.4063093662261963, 0.41403549909591675, 0.42178282141685486, 0.4295494258403778,
0.43733343482017517, 0.44513291120529175, 0.45294591784477234, 0.46077051758766174,
0.46860480308532715, 0.4764467775821686, 0.4842946231365204, 0.492146372795105,
0.5, 0.5078536868095398, 0.515705406665802, 0.5235532522201538,
0.5313953161239624, 0.5392295718193054, 0.5470541715621948, 0.5548672080039978,
0.562666654586792, 0.5704506635665894, 0.5782172679901123, 0.5859646201133728,
0.5936906933784485, 0.6013936996459961, 0.609071671962738, 0.6167227625846863,
0.6243450045585632, 0.6319366097450256, 0.6394955515861511, 0.6470202207565308,
0.6545085310935974, 0.6619587540626526, 0.6693689823150635, 0.6767374277114868,
0.6840623021125793, 0.691341757774353, 0.6985740065574646, 0.7057572603225708,
0.7128896713256836, 0.719969630241394, 0.7269952893257141, 0.7339649796485901,
0.7408769130706787, 0.7477294206619263, 0.7545207738876343, 0.761249303817749,
0.7679134607315063, 0.774511456489563, 0.7810417413711548, 0.7875027060508728,
0.7938927412033081, 0.800210177898407, 0.8064535856246948, 0.8126214146614075,
0.8187121152877808, 0.8247240781784058, 0.8306560516357422, 0.8365063667297363,
0.8422735929489136, 0.8479564785957336, 0.8535534143447876, 0.8590631484985352,
0.8644843101501465, 0.8698155879974365, 0.8750555515289307, 0.8802030086517334,
0.8852566480636597, 0.8902152180671692, 0.8950775265693665, 0.899842381477356,
0.9045084714889526, 0.9090749025344849, 0.9135403037071228, 0.9179036617279053,
0.9221639633178711, 0.9263200759887695, 0.9303710460662842, 0.9343158006668091,
0.9381533861160278, 0.941882848739624, 0.945503294467926, 0.9490138292312622,
0.9524135589599609, 0.9557017087936401, 0.9588773250579834, 0.961939811706543,
0.9648882746696472, 0.9677220582962036, 0.9704403877258301, 0.9730427265167236,
0.9755282998085022, 0.9778965711593628, 0.9801468849182129, 0.9822787046432495,
0.9842916131019592, 0.9861849546432495, 0.9879584312438965, 0.9896113872528076,
0.9911436438560486, 0.9925546646118164, 0.9938441514968872, 0.9950118064880371,
0.996057391166687, 0.9969804883003235, 0.997780978679657, 0.9984586238861084,
0.999013364315033, 0.9994449615478516, 0.9997532367706299, 0.9999383091926575,
1.0, 0.9999383091926575, 0.9997532367706299, 0.9994449615478516,
0.999013364315033, 0.9984586238861084, 0.997780978679657, 0.9969804286956787,
0.9960573315620422, 0.9950118064880371, 0.9938441514968872, 0.9925546646118164,
0.9911435842514038, 0.9896113872528076, 0.9879583716392517, 0.9861849546432495,
0.9842915534973145, 0.9822787046432495, 0.9801468253135681, 0.9778964519500732,
0.9755282402038574, 0.9730426073074341, 0.9704403877258301, 0.9677219390869141,
0.9648882150650024, 0.9619396924972534, 0.9588772654533386, 0.9557015895843506,
0.9524134397506714, 0.9490137100219727, 0.9455032348632812, 0.9418827295303345,
0.9381532669067383, 0.9343156814575195, 0.9303709268569946, 0.9263200759887695,
0.9221639633178711, 0.9179036617279053, 0.913540244102478, 0.9090747833251953,
0.9045084714889526, 0.8998422622680664, 0.8950774669647217, 0.8902151584625244,
0.8852565884590149, 0.8802029490470886, 0.8750554919242859, 0.869815468788147,
0.8644842505455017, 0.8590630888938904, 0.853553295135498, 0.8479562997817993,
0.842273473739624, 0.836506187915802, 0.8306558728218079, 0.8247239589691162,
0.8187118768692017, 0.8126212358474731, 0.8064534664154053, 0.8002099990844727,
0.793892502784729, 0.7875025272369385, 0.7810416221618652, 0.7745113372802734,
0.767913281917572, 0.7612491846084595, 0.7545205950737, 0.7477291822433472,
0.7408767342567444, 0.7339648008346558, 0.7269951105117798, 0.7199694514274597,
0.7128894925117493, 0.7057570219039917, 0.6985738277435303, 0.6913415789604187,
0.684062123298645, 0.6767372488975525, 0.6693688035011292, 0.6619585752487183,
0.6545083522796631, 0.6470199823379517, 0.6394953727722168, 0.6319363117218018,
0.6243447661399841, 0.6167224645614624, 0.6090714335441589, 0.601393461227417,
0.5936904549598694, 0.5859643220901489, 0.5782170295715332, 0.5704504251480103,
0.5626664161682129, 0.5548669099807739, 0.5470539331436157, 0.5392293334007263,
0.5313950181007385, 0.5235530138015747, 0.5157051682472229, 0.507853627204895,
0.5, 0.4921463429927826, 0.484294593334198, 0.4764467477798462,
0.46860471367836, 0.4607704281806946, 0.4529458284378052, 0.4451328217983246,
0.437333345413208, 0.42954933643341064, 0.4217827320098877, 0.4140354096889496,
0.4063093066215515, 0.3986063003540039, 0.39092832803726196, 0.3832772672176361,
0.37565499544143677, 0.36806342005729675, 0.3605043888092041, 0.35297977924346924,
0.3454914391040802, 0.338041216135025, 0.33063095808029175, 0.3232625126838684,
0.3159376382827759, 0.3086581826210022, 0.3014259934425354, 0.2942427396774292,
0.28711026906967163, 0.2800303101539612, 0.2730046510696411, 0.2660350203514099,
0.2591230869293213, 0.25227057933807373, 0.24547919631004333, 0.2387506067752838,
0.23208650946617126, 0.22548848390579224, 0.21895819902420044, 0.2124972641468048,
0.2061072587966919, 0.19978976249694824, 0.1935463547706604, 0.18737855553627014,
0.18128788471221924, 0.17527586221694946, 0.1693439483642578, 0.16349363327026367,
0.15772631764411926, 0.15204349160194397, 0.14644649624824524, 0.1409367322921753,
0.13551557064056396, 0.1301843225955963, 0.12494435906410217, 0.11979690194129944,
0.11474326252937317, 0.10978469252586365, 0.10492238402366638, 0.10015755891799927,
0.09549137949943542, 0.09092503786087036, 0.08645960688591003, 0.08209621906280518,
0.07783591747283936, 0.07367980480194092, 0.06962886452674866, 0.06568413972854614,
0.06184655427932739, 0.0581170916557312, 0.0544966459274292, 0.05098611116409302,
0.04758638143539429, 0.044298261404037476, 0.04112258553504944, 0.038060128688812256,
0.03511166572570801, 0.03227788209915161, 0.02955952286720276, 0.02695724368095398,
0.024471670389175415, 0.02210339903831482, 0.01985308527946472, 0.017721205949783325,
0.015708357095718384, 0.0138150155544281, 0.012041598558425903, 0.010388582944869995,
0.008856356143951416, 0.007445335388183594, 0.006155818700790405, 0.004988163709640503,
0.003942638635635376, 0.0030195116996765137, 0.0022190213203430176, 0.0015413165092468262,
0.000986635684967041, 0.0005550682544708252, 0.0002467334270477295, 6.16908073425293e-05};
std::vector<float> hann(frame_size);
for (int i = 0; i < frame_size; i++) {
hann[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(frame_size)));
}

// Calculate the length of padding
int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
Expand Down Expand Up @@ -3122,8 +3026,8 @@ int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, st
}

// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads, bool debug) {
return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, debug);
int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, false);
}

// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
Expand Down

0 comments on commit 65fd0e1

Please sign in to comment.