diff --git a/whisper.cpp b/whisper.cpp index 42dc3405438..e72da2dc70a 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2361,8 +2361,8 @@ static void dft(const std::vector & in, std::vector & out) { for (int n = 0; n < N; n++) { float angle = 2*M_PI*k*n/N; - re += in[n]*cos(angle); - im -= in[n]*sin(angle); + re += in[n]*cosf(angle); + im -= in[n]*sinf(angle); } out[k*2 + 0] = re; @@ -2413,8 +2413,8 @@ static void fft(const std::vector & in, std::vector & out) { for (int k = 0; k < N/2; k++) { float theta = 2*M_PI*k/N; - float re = cos(theta); - float im = -sin(theta); + float re = cosf(theta); + float im = -sinf(theta); float re_odd = odd_fft[2*k + 0]; float im_odd = odd_fft[2*k + 1]; @@ -2506,109 +2506,13 @@ static bool log_mel_spectrogram( whisper_mel & mel) { const int64_t t_start_us = ggml_time_us(); - // Hanning window (Hard-coded to eliminate difference) + // Hanning window (Use cosf to eliminate difference) // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 - std::vector hann = {0.0, 6.16908073425293e-05, 0.0002467334270477295, 0.0005550682544708252, - 0.000986635684967041, 0.0015413463115692139, 0.0022190213203430176, 0.0030195116996765137, - 0.003942638635635376, 0.004988163709640503, 0.006155818700790405, 0.007445335388183594, - 0.008856385946273804, 0.010388582944869995, 0.012041628360748291, 0.013815045356750488, - 0.01570841670036316, 0.01772129535675049, 0.019853144884109497, 0.022103488445281982, - 0.02447172999382019, 0.026957333087921143, 0.029559612274169922, 0.03227800130844116, - 0.03511175513267517, 0.03806024789810181, 0.0411226749420166, 0.044298380613327026, - 0.04758647084236145, 0.05098623037338257, 0.05449673533439636, 0.058117181062698364, - 0.06184667348861694, 0.0656842589378357, 0.06962898373603821, 0.07367992401123047, - 0.0778360664844513, 0.08209633827209473, 0.08645972609519958, 0.09092515707015991, - 0.09549149870872498, 0.10015767812728882, 0.10492250323295593, 0.1097848117351532, - 0.11474338173866272, 0.11979702115058899, 0.12494447827339172, 0.13018447160720825, - 0.1355157196521759, 0.14093685150146484, 0.1464466154575348, 0.15204361081123352, - 0.1577264666557312, 0.16349375247955322, 0.16934409737586975, 0.1752760112285614, - 0.18128803372383118, 0.18737870454788208, 0.19354650378227234, 0.1997898817062378, - 0.20610737800598145, 0.21249738335609436, 0.21895831823349, 0.2254886031150818, - 0.23208662867546082, 0.23875075578689575, 0.24547931551933289, 0.2522706985473633, - 0.25912320613861084, 0.26603513956069946, 0.27300477027893066, 0.2800304591655731, - 0.2871103882789612, 0.29424285888671875, 0.30142611265182495, 0.30865830183029175, - 0.31593772768974304, 0.3232625722885132, 0.3306310474872589, 0.3380413055419922, - 0.34549152851104736, 0.352979838848114, 0.3605044484138489, 0.3680635094642639, - 0.37565508484840393, 0.38327735662460327, 0.3909284174442291, 0.39860638976097107, - 0.4063093662261963, 0.41403549909591675, 0.42178282141685486, 0.4295494258403778, - 0.43733343482017517, 0.44513291120529175, 0.45294591784477234, 0.46077051758766174, - 0.46860480308532715, 0.4764467775821686, 0.4842946231365204, 0.492146372795105, - 0.5, 0.5078536868095398, 0.515705406665802, 0.5235532522201538, - 0.5313953161239624, 0.5392295718193054, 0.5470541715621948, 0.5548672080039978, - 0.562666654586792, 0.5704506635665894, 0.5782172679901123, 0.5859646201133728, - 0.5936906933784485, 0.6013936996459961, 0.609071671962738, 0.6167227625846863, - 0.6243450045585632, 0.6319366097450256, 0.6394955515861511, 0.6470202207565308, - 0.6545085310935974, 0.6619587540626526, 0.6693689823150635, 0.6767374277114868, - 0.6840623021125793, 0.691341757774353, 0.6985740065574646, 0.7057572603225708, - 0.7128896713256836, 0.719969630241394, 0.7269952893257141, 0.7339649796485901, - 0.7408769130706787, 0.7477294206619263, 0.7545207738876343, 0.761249303817749, - 0.7679134607315063, 0.774511456489563, 0.7810417413711548, 0.7875027060508728, - 0.7938927412033081, 0.800210177898407, 0.8064535856246948, 0.8126214146614075, - 0.8187121152877808, 0.8247240781784058, 0.8306560516357422, 0.8365063667297363, - 0.8422735929489136, 0.8479564785957336, 0.8535534143447876, 0.8590631484985352, - 0.8644843101501465, 0.8698155879974365, 0.8750555515289307, 0.8802030086517334, - 0.8852566480636597, 0.8902152180671692, 0.8950775265693665, 0.899842381477356, - 0.9045084714889526, 0.9090749025344849, 0.9135403037071228, 0.9179036617279053, - 0.9221639633178711, 0.9263200759887695, 0.9303710460662842, 0.9343158006668091, - 0.9381533861160278, 0.941882848739624, 0.945503294467926, 0.9490138292312622, - 0.9524135589599609, 0.9557017087936401, 0.9588773250579834, 0.961939811706543, - 0.9648882746696472, 0.9677220582962036, 0.9704403877258301, 0.9730427265167236, - 0.9755282998085022, 0.9778965711593628, 0.9801468849182129, 0.9822787046432495, - 0.9842916131019592, 0.9861849546432495, 0.9879584312438965, 0.9896113872528076, - 0.9911436438560486, 0.9925546646118164, 0.9938441514968872, 0.9950118064880371, - 0.996057391166687, 0.9969804883003235, 0.997780978679657, 0.9984586238861084, - 0.999013364315033, 0.9994449615478516, 0.9997532367706299, 0.9999383091926575, - 1.0, 0.9999383091926575, 0.9997532367706299, 0.9994449615478516, - 0.999013364315033, 0.9984586238861084, 0.997780978679657, 0.9969804286956787, - 0.9960573315620422, 0.9950118064880371, 0.9938441514968872, 0.9925546646118164, - 0.9911435842514038, 0.9896113872528076, 0.9879583716392517, 0.9861849546432495, - 0.9842915534973145, 0.9822787046432495, 0.9801468253135681, 0.9778964519500732, - 0.9755282402038574, 0.9730426073074341, 0.9704403877258301, 0.9677219390869141, - 0.9648882150650024, 0.9619396924972534, 0.9588772654533386, 0.9557015895843506, - 0.9524134397506714, 0.9490137100219727, 0.9455032348632812, 0.9418827295303345, - 0.9381532669067383, 0.9343156814575195, 0.9303709268569946, 0.9263200759887695, - 0.9221639633178711, 0.9179036617279053, 0.913540244102478, 0.9090747833251953, - 0.9045084714889526, 0.8998422622680664, 0.8950774669647217, 0.8902151584625244, - 0.8852565884590149, 0.8802029490470886, 0.8750554919242859, 0.869815468788147, - 0.8644842505455017, 0.8590630888938904, 0.853553295135498, 0.8479562997817993, - 0.842273473739624, 0.836506187915802, 0.8306558728218079, 0.8247239589691162, - 0.8187118768692017, 0.8126212358474731, 0.8064534664154053, 0.8002099990844727, - 0.793892502784729, 0.7875025272369385, 0.7810416221618652, 0.7745113372802734, - 0.767913281917572, 0.7612491846084595, 0.7545205950737, 0.7477291822433472, - 0.7408767342567444, 0.7339648008346558, 0.7269951105117798, 0.7199694514274597, - 0.7128894925117493, 0.7057570219039917, 0.6985738277435303, 0.6913415789604187, - 0.684062123298645, 0.6767372488975525, 0.6693688035011292, 0.6619585752487183, - 0.6545083522796631, 0.6470199823379517, 0.6394953727722168, 0.6319363117218018, - 0.6243447661399841, 0.6167224645614624, 0.6090714335441589, 0.601393461227417, - 0.5936904549598694, 0.5859643220901489, 0.5782170295715332, 0.5704504251480103, - 0.5626664161682129, 0.5548669099807739, 0.5470539331436157, 0.5392293334007263, - 0.5313950181007385, 0.5235530138015747, 0.5157051682472229, 0.507853627204895, - 0.5, 0.4921463429927826, 0.484294593334198, 0.4764467477798462, - 0.46860471367836, 0.4607704281806946, 0.4529458284378052, 0.4451328217983246, - 0.437333345413208, 0.42954933643341064, 0.4217827320098877, 0.4140354096889496, - 0.4063093066215515, 0.3986063003540039, 0.39092832803726196, 0.3832772672176361, - 0.37565499544143677, 0.36806342005729675, 0.3605043888092041, 0.35297977924346924, - 0.3454914391040802, 0.338041216135025, 0.33063095808029175, 0.3232625126838684, - 0.3159376382827759, 0.3086581826210022, 0.3014259934425354, 0.2942427396774292, - 0.28711026906967163, 0.2800303101539612, 0.2730046510696411, 0.2660350203514099, - 0.2591230869293213, 0.25227057933807373, 0.24547919631004333, 0.2387506067752838, - 0.23208650946617126, 0.22548848390579224, 0.21895819902420044, 0.2124972641468048, - 0.2061072587966919, 0.19978976249694824, 0.1935463547706604, 0.18737855553627014, - 0.18128788471221924, 0.17527586221694946, 0.1693439483642578, 0.16349363327026367, - 0.15772631764411926, 0.15204349160194397, 0.14644649624824524, 0.1409367322921753, - 0.13551557064056396, 0.1301843225955963, 0.12494435906410217, 0.11979690194129944, - 0.11474326252937317, 0.10978469252586365, 0.10492238402366638, 0.10015755891799927, - 0.09549137949943542, 0.09092503786087036, 0.08645960688591003, 0.08209621906280518, - 0.07783591747283936, 0.07367980480194092, 0.06962886452674866, 0.06568413972854614, - 0.06184655427932739, 0.0581170916557312, 0.0544966459274292, 0.05098611116409302, - 0.04758638143539429, 0.044298261404037476, 0.04112258553504944, 0.038060128688812256, - 0.03511166572570801, 0.03227788209915161, 0.02955952286720276, 0.02695724368095398, - 0.024471670389175415, 0.02210339903831482, 0.01985308527946472, 0.017721205949783325, - 0.015708357095718384, 0.0138150155544281, 0.012041598558425903, 0.010388582944869995, - 0.008856356143951416, 0.007445335388183594, 0.006155818700790405, 0.004988163709640503, - 0.003942638635635376, 0.0030195116996765137, 0.0022190213203430176, 0.0015413165092468262, - 0.000986635684967041, 0.0005550682544708252, 0.0002467334270477295, 6.16908073425293e-05}; + std::vector hann(frame_size); + for (int i = 0; i < frame_size; i++) { + hann[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(frame_size))); + } // Calculate the length of padding int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; @@ -3122,8 +3026,8 @@ int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, st } // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) -int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads, bool debug) { - return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, debug); +int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { + return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, false); } // same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2