-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[HOLD] add language tag from cocina to sqs message #1428
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,18 +43,23 @@ def message_body | |
end | ||
|
||
def job_id | ||
@job_id ||= Dor::TextExtraction::SpeechToText.new(cocina_object:).job_id | ||
stt.job_id | ||
end | ||
|
||
# array of media files in the bucket folder for this job (excluding s3 folders) | ||
def media | ||
aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id).contents.map(&:key).reject { |key| key.end_with?('/') } | ||
filenames = aws_provider.client.list_objects(bucket: aws_provider.bucket_name, prefix: job_id).contents.map(&:key).reject { |key| key.end_with?('/') } | ||
filenames.map { |filename| { name: filename, options: { language: stt.language_tag(File.basename(filename)) } } } | ||
end | ||
|
||
# pulled from config, could later be overridden by settings in the workflow context | ||
def whisper_options | ||
Settings.speech_to_text.whisper.to_h | ||
end | ||
|
||
def stt | ||
@stt ||= Dor::TextExtraction::SpeechToText.new(cocina_object:) | ||
end | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. cache this in the object so it can be used in our updated |
||
end | ||
end | ||
end | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,7 @@ | |
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [mp4_file, mp4_file_not_shelved, mp4_file_not_preserved]) } | ||
let(:third_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file2]) } | ||
let(:m4a_file) { build_file('file1.m4a') } | ||
let(:mp4_file) { build_file('file1.mp4') } | ||
let(:mp4_file) { build_file('file1.mp4', language_tag: 'es') } | ||
let(:mp4_file_not_shelved) { build_file('file2.mp4', shelve: false) } | ||
let(:mp4_file_not_preserved) { build_file('file3.mp4', preserve: false) } | ||
let(:text_file) { build_file('file1.txt') } | ||
|
@@ -45,7 +45,7 @@ | |
|
||
context 'when the object has no files that can be STTed' do | ||
let(:first_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file]) } | ||
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file, text_file]) } | ||
let(:second_fileset_structural) { instance_double(Cocina::Models::FileSetStructural, contains: [text_file, text_file2]) } | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. small typo from a previous PR, turned out to have no impact on the test, but just fixing because |
||
|
||
it 'returns false' do | ||
expect(stt.possible?).to be false | ||
|
@@ -88,6 +88,35 @@ | |
end | ||
end | ||
|
||
describe '#language_tag' do | ||
context 'when the file cannot be found' do | ||
let(:filename) { 'bogus.mp4' } | ||
|
||
it 'returns nil' do | ||
expect(stt.filenames_to_stt).not_to include(filename) | ||
expect(stt.language_tag(filename)).to be_nil | ||
end | ||
end | ||
|
||
context 'when the file is found and there is no language tag in cocina' do | ||
let(:filename) { 'file1.m4a' } | ||
|
||
it 'returns nil' do | ||
expect(stt.filenames_to_stt).to include(filename) | ||
expect(stt.language_tag(filename)).to be_nil | ||
end | ||
end | ||
|
||
context 'when the file is found and there is a language tag in cocina' do | ||
let(:filename) { 'file1.mp4' } | ||
|
||
it 'returns the language tag' do | ||
expect(stt.filenames_to_stt).to include(filename) | ||
expect(stt.language_tag(filename)).to eq 'es' | ||
end | ||
end | ||
end | ||
|
||
describe '#cleanup' do | ||
let(:client) { instance_double(Aws::S3::Client, list_objects:) } | ||
let(:list_objects) { instance_double(Aws::S3::Types::ListObjectsOutput, contents: [m4a_object, mp4_object]) } | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
question ... do we want the default value to pass to whisper to be nil or `en`? this makes it nil, but we could make it `en`
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Preferably there would be no language parameter passed to the service if it is not known.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let me see if I can make that adjustment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
changed - so ready for testing when the new container is ready