sul-dlss · peetucket · Nov 26, 2024 · Nov 27, 2024 · peetucket · Nov 26, 2024
diff --git a/lib/dor/text_extraction/speech_to_text.rb b/lib/dor/text_extraction/speech_to_text.rb
@@ -71,6 +71,12 @@ def output_location
         "#{job_id}/output"
       end
 
+      # Given a filename, find the matching file among those that can be sent for speech to text and return the
+      # languageTag recorded for it in cocina (nil if the file is not in that list or has no language set)
+      def language_tag(filename)
+        stt_files.find { |file| file.filename == filename }&.languageTag
+      end
+
       private
 
       # iterate through cocina structural contains and return all File objects for files that need to be stt'd

diff --git a/lib/robots/dor_repo/speech_to_text/stt_create.rb b/lib/robots/dor_repo/speech_to_text/stt_create.rb
@@ -43,18 +43,23 @@ def message_body
         end
 
         def job_id
-          @job_id ||= Dor::TextExtraction::SpeechToText.new(cocina_object:).job_id
+          stt.job_id
         end
 
         # array of media files in the bucket folder for this job (excluding s3 folders)
         def media
-          aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id).contents.map(&:key).reject { |key| key.end_with?('/') }
+          filenames = aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id).contents.map(&:key).reject { |key| key.end_with?('/') }
+          filenames.map { |filename| { name: filename, options: { language: stt.language_tag(File.basename(filename)) } } }
         end
 
-        # pulled from config, could later be overriden by settings in the workflow context
+        # pulled from config, could later be overridden by settings in the workflow context
         def whisper_options
           Settings.speech_to_text.whisper.to_h
         end
+
+        def stt
+          @stt ||= Dor::TextExtraction::SpeechToText.new(cocina_object:)
+        end
       end
     end
   end

diff --git a/spec/lib/dor/text_extraction/speech_to_text_spec.rb b/spec/lib/dor/text_extraction/speech_to_text_spec.rb
@@ -19,7 +19,7 @@
   let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [mp4_file, mp4_file_not_shelved, mp4_file_not_preserved]) }
   let(:third_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file2]) }
   let(:m4a_file) { build_file('file1.m4a') }
-  let(:mp4_file) { build_file('file1.mp4') }
+  let(:mp4_file) { build_file('file1.mp4', language_tag: 'es') }
   let(:mp4_file_not_shelved) { build_file('file2.mp4', shelve: false) }
   let(:mp4_file_not_preserved) { build_file('file3.mp4', preserve: false) }
   let(:text_file) { build_file('file1.txt') }
@@ -45,7 +45,7 @@
 
       context 'when the object has no files that can be STTed' do
         let(:first_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file]) }
-        let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file, text_file]) }
+        let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file, text_file2]) }
 
         it 'returns false' do
           expect(stt.possible?).to be false
@@ -88,6 +88,35 @@
     end
   end
 
+  describe '#language_tag' do
+    context 'when the file cannot be found' do
+      let(:filename) { 'bogus.mp4' }
+
+      it 'returns nil' do
+        expect(stt.filenames_to_stt).not_to include(filename)
+        expect(stt.language_tag(filename)).to be_nil
+      end
+    end
+
+    context 'when the file is found and there is no language tag in cocina' do
+      let(:filename) { 'file1.m4a' }
+
+      it 'returns nil' do
+        expect(stt.filenames_to_stt).to include(filename)
+        expect(stt.language_tag(filename)).to be_nil
+      end
+    end
+
+    context 'when the file is found and there is a language tag in cocina' do
+      let(:filename) { 'file1.mp4' }
+
+      it 'returns the language tag' do
+        expect(stt.filenames_to_stt).to include(filename)
+        expect(stt.language_tag(filename)).to eq 'es'
+      end
+    end
+  end
+
   describe '#cleanup' do
     let(:client) { instance_double(Aws::S3::Client, list_objects:) }
     let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [m4a_object, mp4_object]) }

diff --git a/spec/robots/dor_repo/speech_to_text/stt_create_spec.rb b/spec/robots/dor_repo/speech_to_text/stt_create_spec.rb
@@ -10,7 +10,8 @@
   let(:robot) { described_class.new }
   let(:aws_client) { instance_double(Aws::SQS::Client) }
   let(:aws_s3_client) { instance_double(Aws::S3::Client) }
-  let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, job_id:, filenames_to_stt: ['file1.mov', 'file2.mp3']) }
+  let(:filenames_to_stt) { ['file1.mov', 'file2.mp3'] }
+  let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, job_id:, filenames_to_stt:) }
   let(:cocina_model) { build(:dro, id: druid).new(structural: {}, type: object_type, access: { view: 'world' }) }
   let(:object_type) { Cocina::Models::ObjectType.media }
   let(:dsa_object_client) do
@@ -23,10 +24,10 @@
     instance_double(Dor::Workflow::Response::Process, lane_id: 'lane1', context: { 'runSpeechToText' => true })
   end
   let(:job_id) { "#{bare_druid}-v1" }
-  let(:media) { ["#{job_id}/file1.mov", "#{job_id}/file2.mp3"] }
+  let(:media) { [{ name: "#{job_id}/#{filenames_to_stt[0]}", options: { language: 'en' } }, { name: "#{job_id}/#{filenames_to_stt[1]}", options: { language: 'es' } }] }
   let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [mov_object, mp3_object]) }
-  let(:mov_object) { instance_double(Aws::S3::Types::Object, key: media[0]) }
-  let(:mp3_object) { instance_double(Aws::S3::Types::Object, key: media[1]) }
+  let(:mov_object) { instance_double(Aws::S3::Types::Object, key: media[0][:name]) }
+  let(:mp3_object) { instance_double(Aws::S3::Types::Object, key: media[1][:name]) }
 
   before do
     allow(Aws::S3::Client).to receive(:new).and_return(aws_s3_client)
@@ -35,6 +36,8 @@
     allow(Dor::TextExtraction::SpeechToText).to receive(:new).and_return(stt)
     allow(LyberCore::WorkflowClientFactory).to receive(:build).and_return(workflow_client)
     allow(aws_s3_client).to receive(:list_objects).and_return(list_objects)
+    allow(stt).to receive(:language_tag).with(filenames_to_stt[0]).and_return('en')
+    allow(stt).to receive(:language_tag).with(filenames_to_stt[1]).and_return('es')
   end
 
   context 'when the message is sent successfully' do

diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -40,11 +40,13 @@ def clone_test_input(destination)
   system "rsync -rqOlt --delete #{source}/ #{destination}/"
 end
 
-def build_file(filename, preserve: true, shelve: true, corrected: false, sdr_generated: false)
+# rubocop:disable Metrics/ParameterLists
+def build_file(filename, preserve: true, shelve: true, corrected: false, sdr_generated: false, language_tag: nil)
   extension = File.extname(filename)
   mimetype = { '.pdf' => 'application/pdf', '.tif' => 'image/tiff', '.jpg' => 'image/jpeg', '.txt' => 'text/plain',
                '.m4a' => 'audio/mp4', '.mp4' => 'video/mp4', '.vtt' => 'text/vtt', '.xml' => 'application/xml' }
   sdr_value = instance_double(Cocina::Models::FileAdministrative, sdrPreserve: preserve, shelve:)
-  instance_double(Cocina::Models::File, administrative: sdr_value, hasMimeType: mimetype[extension],
+  instance_double(Cocina::Models::File, administrative: sdr_value, hasMimeType: mimetype[extension], languageTag: language_tag,
                                         filename:, correctedForAccessibility: corrected, sdrGeneratedText: sdr_generated)
 end
+# rubocop:enable Metrics/ParameterLists