Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[HOLD] add language tag from cocina to sqs message #1428

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions lib/dor/text_extraction/speech_to_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ def output_location
"#{job_id}/output"
end

# given a filename, find it in the list of files that can be sent for speech to text (taken from the cocina structural)
# and return the languageTag for that file (or nil if the file is not in the list or no language is set)
def language_tag(filename)
# `find` yields nil when no entry in stt_files has this filename; `&.` then makes the whole expression nil instead of raising
stt_files.find { |file| file.filename == filename }&.languageTag
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question ... do we want the default value to pass to whisper to be nil or en ? this makes it nil, but we could make it en

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Preferably there would be no language parameter passed to the service if it is not known.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me see if I can make that adjustment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed - so ready for testing when the new container is ready

end

private

# iterate through cocina structural contains and return all File objects for files that need to be stt'd
Expand Down
9 changes: 7 additions & 2 deletions lib/robots/dor_repo/speech_to_text/stt_create.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,23 @@ def message_body
end

# Job id for this object's speech-to-text run.
#
# Delegates to the memoized `stt` helper so a single SpeechToText instance is shared
# with `media`, instead of building (and discarding) a second one here.
#
# @return the job id reported by the SpeechToText instance
def job_id
  stt.job_id
end

# array of media files in the bucket folder for this job (excluding s3 folders)
# Lists the bucket once and pairs every media key with its per-file options.
#
# @return [Array<Hash>] one `{ name:, options: }` hash per key under the job id prefix;
#   `options` contains `language` only when a language tag is known for the file, so an
#   unknown language is omitted rather than sent as nil
def media
  listing = aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id)
  # keys ending in '/' are folder placeholders, not media files
  keys = listing.contents.map(&:key).reject { |key| key.end_with?('/') }
  keys.map do |key|
    # keys carry the job id prefix, while language_tag is looked up by bare filename
    { name: key, options: { language: stt.language_tag(File.basename(key)) }.compact }
  end
end

# pulled from config, could later be overridden by settings in the workflow context
def whisper_options
  # take the whisper section of the speech_to_text settings and convert it with to_h
  whisper_settings = Settings.speech_to_text.whisper
  whisper_settings.to_h
end

# Lazily builds the SpeechToText helper for this cocina object and reuses it on later calls.
def stt
  return @stt if @stt

  @stt = Dor::TextExtraction::SpeechToText.new(cocina_object:)
end
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cache this in the object so it can be used in our updated media method without re-instantiating it.

end
end
end
Expand Down
33 changes: 31 additions & 2 deletions spec/lib/dor/text_extraction/speech_to_text_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [mp4_file, mp4_file_not_shelved, mp4_file_not_preserved]) }
let(:third_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file2]) }
let(:m4a_file) { build_file('file1.m4a') }
let(:mp4_file) { build_file('file1.mp4') }
let(:mp4_file) { build_file('file1.mp4', language_tag: 'es') }
let(:mp4_file_not_shelved) { build_file('file2.mp4', shelve: false) }
let(:mp4_file_not_preserved) { build_file('file3.mp4', preserve: false) }
let(:text_file) { build_file('file1.txt') }
Expand All @@ -45,7 +45,7 @@

context 'when the object has no files that can be STTed' do
let(:first_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file]) }
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file, text_file]) }
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file, text_file2]) }
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

small typo from a previous PR; it turned out to have no impact on the test, but fixing it anyway


it 'returns false' do
expect(stt.possible?).to be false
Expand Down Expand Up @@ -88,6 +88,35 @@
end
end

# covers the three lookup outcomes: file not in the stt list, file listed without a tag, file listed with a tag
describe '#language_tag' do
context 'when the file cannot be found' do
let(:filename) { 'bogus.mp4' }

it 'returns nil' do
# first confirm the fixture really does not list this file, then check the lookup
expect(stt.filenames_to_stt).not_to include(filename)
expect(stt.language_tag(filename)).to be_nil
end
end

context 'when the file is found and there is no language tag in cocina' do
# file1.m4a is built with build_file('file1.m4a'), i.e. without a language_tag argument
let(:filename) { 'file1.m4a' }

it 'returns nil' do
expect(stt.filenames_to_stt).to include(filename)
expect(stt.language_tag(filename)).to be_nil
end
end

context 'when the file is found and there is a language tag in cocina' do
# file1.mp4 is built with language_tag: 'es'
let(:filename) { 'file1.mp4' }

it 'returns the language tag' do
expect(stt.filenames_to_stt).to include(filename)
expect(stt.language_tag(filename)).to eq 'es'
end
end
end

describe '#cleanup' do
let(:client) { instance_double(Aws::S3::Client, list_objects:) }
let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [m4a_object, mp4_object]) }
Expand Down
11 changes: 7 additions & 4 deletions spec/robots/dor_repo/speech_to_text/stt_create_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
let(:robot) { described_class.new }
let(:aws_client) { instance_double(Aws::SQS::Client) }
let(:aws_s3_client) { instance_double(Aws::S3::Client) }
let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, job_id:, filenames_to_stt: ['file1.mov', 'file2.mp3']) }
let(:filenames_to_stt) { ['file1.mov', 'file2.mp3'] }
let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, job_id:, filenames_to_stt:) }
let(:cocina_model) { build(:dro, id: druid).new(structural: {}, type: object_type, access: { view: 'world' }) }
let(:object_type) { Cocina::Models::ObjectType.media }
let(:dsa_object_client) do
Expand All @@ -23,10 +24,10 @@
instance_double(Dor::Workflow::Response::Process, lane_id: 'lane1', context: { 'runSpeechToText' => true })
end
let(:job_id) { "#{bare_druid}-v1" }
let(:media) { ["#{job_id}/file1.mov", "#{job_id}/file2.mp3"] }
let(:media) { [{ name: "#{job_id}/#{filenames_to_stt[0]}", options: { language: 'en' } }, { name: "#{job_id}/#{filenames_to_stt[1]}", options: { language: 'es' } }] }
let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [mov_object, mp3_object]) }
let(:mov_object) { instance_double(Aws::S3::Types::Object, key: media[0]) }
let(:mp3_object) { instance_double(Aws::S3::Types::Object, key: media[1]) }
let(:mov_object) { instance_double(Aws::S3::Types::Object, key: media[0][:name]) }
let(:mp3_object) { instance_double(Aws::S3::Types::Object, key: media[1][:name]) }

before do
allow(Aws::S3::Client).to receive(:new).and_return(aws_s3_client)
Expand All @@ -35,6 +36,8 @@
allow(Dor::TextExtraction::SpeechToText).to receive(:new).and_return(stt)
allow(LyberCore::WorkflowClientFactory).to receive(:build).and_return(workflow_client)
allow(aws_s3_client).to receive(:list_objects).and_return(list_objects)
allow(stt).to receive(:language_tag).with(filenames_to_stt[0]).and_return('en')
allow(stt).to receive(:language_tag).with(filenames_to_stt[1]).and_return('es')
end

context 'when the message is sent successfully' do
Expand Down
6 changes: 4 additions & 2 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@ def clone_test_input(destination)
system "rsync -rqOlt --delete #{source}/ #{destination}/"
end

# rubocop:disable Metrics/ParameterLists
# Spec helper: builds an instance_double of Cocina::Models::File.
#
# @param filename [String] filename reported by the double; its extension selects the mimetype
# @param preserve [Boolean] value returned by administrative.sdrPreserve
# @param shelve [Boolean] value returned by administrative.shelve
# @param corrected [Boolean] value returned by correctedForAccessibility
# @param sdr_generated [Boolean] value returned by sdrGeneratedText
# @param language_tag [String, nil] value returned by languageTag
# @return the file double; hasMimeType is nil when the extension is not in the lookup table below
def build_file(filename, preserve: true, shelve: true, corrected: false, sdr_generated: false, language_tag: nil)
  extension = File.extname(filename)
  mimetype = { '.pdf' => 'application/pdf', '.tif' => 'image/tiff', '.jpg' => 'image/jpeg', '.txt' => 'text/plain',
               '.m4a' => 'audio/mp4', '.mp4' => 'video/mp4', '.vtt' => 'text/vtt', '.xml' => 'application/xml' }
  sdr_value = instance_double(Cocina::Models::FileAdministrative, sdrPreserve: preserve, shelve:)
  instance_double(Cocina::Models::File, administrative: sdr_value, hasMimeType: mimetype[extension], languageTag: language_tag,
                                        filename:, correctedForAccessibility: corrected, sdrGeneratedText: sdr_generated)
end
# rubocop:enable Metrics/ParameterLists