diff --git a/lib/pocketsphinx.rb b/lib/pocketsphinx.rb
index 0125d99..b9b0390 100644
--- a/lib/pocketsphinx.rb
+++ b/lib/pocketsphinx.rb
@@ -3,6 +3,7 @@
 require "pocketsphinx/version"
 
 # Pocketsphinx FFI API
+require "pocketsphinx/api/sphinxbase/cmn"
 require "pocketsphinx/api/sphinxbase"
 require "pocketsphinx/api/sphinxad"
 require "pocketsphinx/api/pocketsphinx"
@@ -22,6 +23,7 @@
 require "pocketsphinx/audio_file"
 require "pocketsphinx/microphone"
 require "pocketsphinx/decoder"
+require "pocketsphinx/cmn_decoder"
 require "pocketsphinx/speech_recognizer"
 require "pocketsphinx/live_speech_recognizer"
 require "pocketsphinx/audio_file_speech_recognizer"
diff --git a/lib/pocketsphinx/api/pocketsphinx.rb b/lib/pocketsphinx/api/pocketsphinx.rb
index 9e3bded..e0c061e 100644
--- a/lib/pocketsphinx/api/pocketsphinx.rb
+++ b/lib/pocketsphinx/api/pocketsphinx.rb
@@ -26,6 +26,7 @@ def self.ps_init(*args)
       attach_function :ps_unset_search, [:decoder, :string], :int
       attach_function :ps_get_search, [:decoder], :string
       attach_function :ps_set_search, [:decoder, :string], :int
+      attach_function :ps_get_feat, [:decoder], :pointer
 
       typedef :pointer, :seg_iter
 
@@ -35,6 +36,23 @@ def self.ps_init(*args)
       attach_function :ps_seg_frames, [:seg_iter, :pointer, :pointer], :void
       attach_function :ps_seg_prob, [:seg_iter, :pointer, :pointer, :pointer], :int32
       attach_function :ps_seg_free, [:seg_iter], :void
+
+      # Reads the current cepstral mean normalization (CMN) vector of a decoder.
+      #
+      # @param ps_decoder The native decoder handle passed on to ps_get_feat
+      # @return [Array<Float>] The veclen entries of cmn_mean, or an empty array if
+      #   the feature, CMN structure or mean vector pointer is NULL
+      def self.get_cmn_values(ps_decoder)
+        feat_ptr = ps_get_feat(ps_decoder)
+        return [] if feat_ptr.null?
+
+        cmn = Sphinxbase::Feature.new(feat_ptr)[:cmn_struct]
+        # cmn_struct may be NULL (presumably when CMN is disabled - confirm);
+        # reading fields through a NULL struct would fail instead of returning.
+        return [] if cmn.pointer.null? || cmn[:cmn_mean].null?
+
+        cmn[:cmn_mean].get_array_of_float32(0, cmn[:veclen])
+      end
     end
   end
 end
diff --git a/lib/pocketsphinx/api/sphinxbase.rb b/lib/pocketsphinx/api/sphinxbase.rb
index 6e4d5cf..e492b64 100644
--- a/lib/pocketsphinx/api/sphinxbase.rb
+++ b/lib/pocketsphinx/api/sphinxbase.rb
@@ -4,6 +4,8 @@ module Sphinxbase
       extend FFI::Library
       ffi_lib "libsphinxbase"
 
+      include Cmn
+
       class Argument < FFI::Struct
         layout :name, :string,
           :type, :int,
diff --git a/lib/pocketsphinx/api/sphinxbase/cmn.rb b/lib/pocketsphinx/api/sphinxbase/cmn.rb
new file mode 100644
index 0000000..20c4573
--- /dev/null
+++ b/lib/pocketsphinx/api/sphinxbase/cmn.rb
@@ -0,0 +1,49 @@
+module Pocketsphinx
+  module API
+    module Sphinxbase
+      # FFI mappings for the sphinxbase feature and cepstral mean
+      # normalization (CMN) structures.
+      #
+      # NOTE(review): field order and types must mirror the C structs of the
+      # installed sphinxbase exactly - confirm against its headers.
+      module Cmn
+        extend FFI::Library
+
+        enum :cmn_type, [:none, 0, :current, :prior]
+        enum :agc_type, [:none, 0, :max, :emax, :noise]
+
+        # CMN state: mean, variance and sum pointers, frame count, vector length
+        class CmnData < FFI::Struct
+          layout :cmn_mean, :pointer,
+            :cmn_var, :pointer,
+            :sum, :pointer,
+            :nframe, :int32,
+            :veclen, :int32
+        end
+
+        # Feature computation state; :cmn_struct points at the CmnData in use.
+        # NOTE(review): :mfcc_t is named after a C type rather than a field -
+        # presumably it maps the subvector buffer; confirm against feat.h.
+        class Feature < FFI::Struct
+          layout :refcount, :int,
+            :name, :string,
+            :cepsize, :int32,
+            :n_stream, :int32,
+            :stream_len, :pointer,
+            :window_size, :int32,
+            :n_sv, :int32,
+            :sv_len, :pointer,
+            :subvecs, :pointer,
+            :mfcc_t, :pointer,
+            :sv_dim, :int32,
+            :cmn, :cmn_type,
+            :varnorm, :int32,
+            :agc, :agc_type,
+            :compute_feat, :pointer,
+            :cmn_struct, CmnData.ptr,
+            :agc_struct, :pointer
+        end
+      end
+    end
+  end
+end
diff --git a/lib/pocketsphinx/cmn_decoder.rb b/lib/pocketsphinx/cmn_decoder.rb
new file mode 100644
index 0000000..7d1a861
--- /dev/null
+++ b/lib/pocketsphinx/cmn_decoder.rb
@@ -0,0 +1,40 @@
+module Pocketsphinx
+  # Decoder which repeats a raw decode when the cepstral mean normalization
+  # (CMN) values moved by more than a tolerance during the first pass.
+  class CMNDecoder < Decoder
+    # Tolerance applied when none has been assigned through #cmn_tolerance=
+    CMN_TOLERANCE_DEFAULT = 20
+
+    attr_writer :cmn_tolerance
+
+    # @return [Numeric] The assigned tolerance, or CMN_TOLERANCE_DEFAULT when unset
+    def cmn_tolerance
+      @cmn_tolerance || CMN_TOLERANCE_DEFAULT
+    end
+
+    # Decode a raw audio stream, decoding it a second time if the first pass
+    # changed the CMN values by more than the tolerance.
+    #
+    # @param [IO] audio_file The raw audio stream to decode as a single utterance
+    # @param [Fixnum] max_samples The maximum samples to process from the stream on each iteration
+    def decode_raw(audio_file, max_samples = 2048)
+      repeat_if_cmn_sum_exceeds { super }
+    end
+
+    private
+
+    # Yields once; yields again and returns that result instead when the CMN
+    # values before and after the first yield differ by more than tolerance.
+    def repeat_if_cmn_sum_exceeds(tolerance = cmn_tolerance)
+      before = cmn_values
+      result = yield
+
+      cmn_sum(before, cmn_values) > tolerance ? yield : result
+    end
+
+    # Sum of the absolute element-wise differences between two CMN vectors
+    def cmn_sum(before, after)
+      before.zip(after).inject(0) { |sum, (was, now)| sum + (now - was).abs }
+    end
+  end
+end
diff --git a/lib/pocketsphinx/decoder.rb b/lib/pocketsphinx/decoder.rb
index 39ac537..0a4542b 100644
--- a/lib/pocketsphinx/decoder.rb
+++ b/lib/pocketsphinx/decoder.rb
@@ -66,6 +66,7 @@ def decode(audio_path_or_file, max_samples = 2048)
     # @param [IO] audio_file The raw audio stream to decode as a single utterance
     # @param [Fixnum] max_samples The maximum samples to process from the stream on each iteration
     def decode_raw(audio_file, max_samples = 2048)
+      audio_file.rewind
       start_utterance
 
       FFI::MemoryPointer.new(:int16, max_samples) do |buffer|
@@ -147,6 +148,13 @@ def words
       words
     end
 
+    # Current cepstral mean normalization (CMN) values of this decoder.
+    #
+    # @return [Array<Float>] The values read by ps_api.get_cmn_values
+    def cmn_values
+      ps_api.get_cmn_values(ps_decoder)
+    end
+
     # Adds new search using JSGF model.
     #
     # Convenience method to parse JSGF model from string and create a search.