Skip to content

Commit

Permalink
add language tag from cocina to sqs message
Browse files Browse the repository at this point in the history
  • Loading branch information
peetucket committed Nov 26, 2024
1 parent d6f842f commit 0e2520d
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 10 deletions.
5 changes: 5 additions & 0 deletions lib/dor/text_extraction/speech_to_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ def output_location
"#{job_id}/output"
end

# look in resource structural metadata for a given filename and return its cocina language (or nil if no language is set)
def language_tag(filename)
  # first entry from stt_files whose filename matches exactly (nil when nothing matches)
  matching_file = stt_files.find { |candidate| candidate.filename == filename }
  # safe navigation: nil when no file matched, otherwise that file's languageTag (which may itself be nil)
  matching_file&.languageTag
end

private

# iterate through cocina structural contains and return all File objects for files that need to be stt'd
Expand Down
9 changes: 7 additions & 2 deletions lib/robots/dor_repo/speech_to_text/stt_create.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,23 @@ def message_body
end

def job_id
# NOTE(review): this is a rendered diff hunk without +/- markers; the memoized lookup on the
# next line looks like the pre-commit (removed) version of this method body - confirm against the commit.
@job_id ||= Dor::TextExtraction::SpeechToText.new(cocina_object:).job_id
# Returns the job_id reported by the object that the stt method provides.
stt.job_id
end

# array of media files in the bucket folder for this job (excluding s3 folders)
def media
# NOTE(review): this is a rendered diff hunk without +/- markers; the next line looks like the
# pre-commit (removed) body, which returned the bare list of keys - confirm against the commit.
aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id).contents.map(&:key).reject { |key| key.end_with?('/') }
# Keys listed under the job_id prefix in the bucket, dropping any key that ends with '/'.
filenames = aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id).contents.map(&:key).reject { |key| key.end_with?('/') }
# One hash per key: the full key as :name, plus the language tag looked up by the key's basename
# under options[:language].
filenames.map { |filename| { name: filename, options: { language: stt.language_tag(File.basename(filename)) } } }
end

# pulled from config, could later be overridden by settings in the workflow context
def whisper_options
  # read the whisper section of the speech_to_text settings and hand it back as a plain Hash
  whisper_settings = Settings.speech_to_text.whisper
  whisper_settings.to_h
end

def stt
  # reuse the instance built on an earlier call, if there is one
  return @stt if @stt

  # first call: build the SpeechToText helper for this cocina object and remember it
  @stt = Dor::TextExtraction::SpeechToText.new(cocina_object:)
end
end
end
end
Expand Down
33 changes: 31 additions & 2 deletions spec/lib/dor/text_extraction/speech_to_text_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [mp4_file, mp4_file_not_shelved, mp4_file_not_preserved]) }
let(:third_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file2]) }
let(:m4a_file) { build_file('file1.m4a') }
let(:mp4_file) { build_file('file1.mp4') }
let(:mp4_file) { build_file('file1.mp4', language_tag: 'es') }
let(:mp4_file_not_shelved) { build_file('file2.mp4', shelve: false) }
let(:mp4_file_not_preserved) { build_file('file3.mp4', preserve: false) }
let(:text_file) { build_file('file1.txt') }
Expand All @@ -45,7 +45,7 @@

context 'when the object has no files that can be STTed' do
let(:first_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file]) }
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file, text_file]) }
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file, text_file2]) }

it 'returns false' do
expect(stt.possible?).to be false
Expand Down Expand Up @@ -88,6 +88,35 @@
end
end

describe '#language_tag' do
  # each context supplies `filename`; the lookup under test is evaluated lazily on first reference
  subject(:language_tag) { stt.language_tag(filename) }

  context 'when the file cannot be found' do
    let(:filename) { 'bogus.mp4' }

    it 'returns nil' do
      # precondition: the name is not among the files to be STTed
      expect(stt.filenames_to_stt).not_to include(filename)
      expect(language_tag).to be_nil
    end
  end

  context 'when the file is found and there is no language tag in cocina' do
    let(:filename) { 'file1.m4a' }

    it 'returns nil' do
      # precondition: the file is among the files to be STTed
      expect(stt.filenames_to_stt).to include(filename)
      expect(language_tag).to be_nil
    end
  end

  context 'when the file is found and there is a language tag in cocina' do
    let(:filename) { 'file1.mp4' }

    it 'returns the language tag' do
      # precondition: the file is among the files to be STTed
      expect(stt.filenames_to_stt).to include(filename)
      expect(language_tag).to eq 'es'
    end
  end
end

describe '#cleanup' do
let(:client) { instance_double(Aws::S3::Client, list_objects:) }
let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [m4a_object, mp4_object]) }
Expand Down
11 changes: 7 additions & 4 deletions spec/robots/dor_repo/speech_to_text/stt_create_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
let(:robot) { described_class.new }
let(:aws_client) { instance_double(Aws::SQS::Client) }
let(:aws_s3_client) { instance_double(Aws::S3::Client) }
let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, job_id:, filenames_to_stt: ['file1.mov', 'file2.mp3']) }
let(:filenames_to_stt) { ['file1.mov', 'file2.mp3'] }
let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, job_id:, filenames_to_stt:) }
let(:cocina_model) { build(:dro, id: druid).new(structural: {}, type: object_type, access: { view: 'world' }) }
let(:object_type) { Cocina::Models::ObjectType.media }
let(:dsa_object_client) do
Expand All @@ -23,10 +24,10 @@
instance_double(Dor::Workflow::Response::Process, lane_id: 'lane1', context: { 'runSpeechToText' => true })
end
let(:job_id) { "#{bare_druid}-v1" }
let(:media) { ["#{job_id}/file1.mov", "#{job_id}/file2.mp3"] }
let(:media) { [{ name: "#{job_id}/#{filenames_to_stt[0]}", options: { language: 'en' } }, { name: "#{job_id}/#{filenames_to_stt[1]}", options: { language: 'es' } }] }
let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [mov_object, mp3_object]) }
let(:mov_object) { instance_double(Aws::S3::Types::Object, key: media[0]) }
let(:mp3_object) { instance_double(Aws::S3::Types::Object, key: media[1]) }
let(:mov_object) { instance_double(Aws::S3::Types::Object, key: media[0][:name]) }
let(:mp3_object) { instance_double(Aws::S3::Types::Object, key: media[1][:name]) }

before do
allow(Aws::S3::Client).to receive(:new).and_return(aws_s3_client)
Expand All @@ -35,6 +36,8 @@
allow(Dor::TextExtraction::SpeechToText).to receive(:new).and_return(stt)
allow(LyberCore::WorkflowClientFactory).to receive(:build).and_return(workflow_client)
allow(aws_s3_client).to receive(:list_objects).and_return(list_objects)
allow(stt).to receive(:language_tag).with(filenames_to_stt[0]).and_return('en')
allow(stt).to receive(:language_tag).with(filenames_to_stt[1]).and_return('es')
end

context 'when the message is sent successfully' do
Expand Down
6 changes: 4 additions & 2 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@ def clone_test_input(destination)
system "rsync -rqOlt --delete #{source}/ #{destination}/"
end

# Spec helper: builds an instance_double of Cocina::Models::File for the given filename.
# The mime type is looked up from the filename's extension in the table below (nil for an
# extension that is not listed); preserve/shelve are placed on a
# Cocina::Models::FileAdministrative double exposed as `administrative`.
# NOTE(review): this is a rendered diff hunk without +/- markers - it contains two `def` lines and
# two versions of the instance_double call (the later ones add language_tag / languageTag), so it
# is not runnable as shown; confirm against the commit which lines are current.
def build_file(filename, preserve: true, shelve: true, corrected: false, sdr_generated: false)
# rubocop:disable Metrics/ParameterLists
def build_file(filename, preserve: true, shelve: true, corrected: false, sdr_generated: false, language_tag: nil)
extension = File.extname(filename)
# extension => mime type for the file kinds used in the specs
mimetype = { '.pdf' => 'application/pdf', '.tif' => 'image/tiff', '.jpg' => 'image/jpeg', '.txt' => 'text/plain',
'.m4a' => 'audio/mp4', '.mp4' => 'video/mp4', '.vtt' => 'text/vtt', '.xml' => 'application/xml' }
sdr_value = instance_double(Cocina::Models::FileAdministrative, sdrPreserve: preserve, shelve:)
instance_double(Cocina::Models::File, administrative: sdr_value, hasMimeType: mimetype[extension],
instance_double(Cocina::Models::File, administrative: sdr_value, hasMimeType: mimetype[extension], languageTag: language_tag,
filename:, correctedForAccessibility: corrected, sdrGeneratedText: sdr_generated)
end
# rubocop:enable Metrics/ParameterLists

0 comments on commit 0e2520d

Please sign in to comment.