From 2276dbcf6f8be37e2398f743158d03412f9b3ec5 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Fri, 8 Nov 2024 15:05:03 -0500
Subject: [PATCH] create separate files for aligning words, don't just use the training data.

---
 machine/jobs/word_alignment_build_job.py    |  7 ++++++-
 machine/jobs/word_alignment_file_service.py | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/machine/jobs/word_alignment_build_job.py b/machine/jobs/word_alignment_build_job.py
index c9aa994..d3769c8 100644
--- a/machine/jobs/word_alignment_build_job.py
+++ b/machine/jobs/word_alignment_build_job.py
@@ -50,7 +50,12 @@ def run(
             check_canceled()

         logger.info("Generating alignments")
-        self._batch_inference(parallel_corpus, progress_reporter, check_canceled)
+
+        source_word_alignment_corpus = self._word_alignment_file_service.create_source_word_alignment_corpus()
+        target_word_alignment_corpus = self._word_alignment_file_service.create_target_word_alignment_corpus()
+        word_alignment_parallel_corpus: ParallelTextCorpus = source_word_alignment_corpus.align_rows(target_word_alignment_corpus)
+
+        self._batch_inference(word_alignment_parallel_corpus, progress_reporter, check_canceled)

         self._save_model()
         return train_corpus_size
diff --git a/machine/jobs/word_alignment_file_service.py b/machine/jobs/word_alignment_file_service.py
index 2d8a548..1c78c8a 100644
--- a/machine/jobs/word_alignment_file_service.py
+++ b/machine/jobs/word_alignment_file_service.py
@@ -15,11 +15,15 @@ def __init__(
         config: Any,
         source_filename: str = "train.src.txt",
         target_filename: str = "train.trg.txt",
+        source_word_alignment_filename: str = "align_words.src.json",
+        target_word_alignment_filename: str = "align_words.trg.json",
         word_alignment_filename: str = "word_alignments.json",
     ) -> None:

         self._source_filename = source_filename
         self._target_filename = target_filename
+        self._source_word_alignment_filename = source_word_alignment_filename
+        self._target_word_alignment_filename = target_word_alignment_filename
         self._word_alignment_filename = word_alignment_filename

         self.shared_file_service: SharedFileServiceBase = get_shared_file_service(type, config)
@@ -34,12 +38,28 @@ def create_target_corpus(self) -> TextCorpus:
             self.shared_file_service.download_file(f"{self.shared_file_service.build_path}/{self._target_filename}")
         )

+    def create_source_word_alignment_corpus(self) -> TextCorpus:
+        return TextFileTextCorpus(
+            self.shared_file_service.download_file(f"{self.shared_file_service.build_path}/{self._source_word_alignment_filename}")
+        )
+
+    def create_target_word_alignment_corpus(self) -> TextCorpus:
+        return TextFileTextCorpus(
+            self.shared_file_service.download_file(f"{self.shared_file_service.build_path}/{self._target_word_alignment_filename}")
+        )
+
     def exists_source_corpus(self) -> bool:
         return self.shared_file_service._exists_file(f"{self.shared_file_service.build_path}/{self._source_filename}")

     def exists_target_corpus(self) -> bool:
         return self.shared_file_service._exists_file(f"{self.shared_file_service.build_path}/{self._target_filename}")

+    def exists_source_word_alignment_corpus(self) -> bool:
+        return self.shared_file_service._exists_file(f"{self.shared_file_service.build_path}/{self._source_word_alignment_filename}")
+
+    def exists_target_word_alignment_corpus(self) -> bool:
+        return self.shared_file_service._exists_file(f"{self.shared_file_service.build_path}/{self._target_word_alignment_filename}")
+
     def save_model(self, model_path: Path, destination: str) -> None:
         self.shared_file_service.upload_path(model_path, destination)
