Skip to content

Commit

Permalink
create separate files for aligning words, don't just use the training…
Browse files Browse the repository at this point in the history
… data.
  • Loading branch information
johnml1135 committed Nov 11, 2024
1 parent 0fb9518 commit 2276dbc
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 1 deletion.
7 changes: 6 additions & 1 deletion machine/jobs/word_alignment_build_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,12 @@ def run(
check_canceled()

logger.info("Generating alignments")
self._batch_inference(parallel_corpus, progress_reporter, check_canceled)

source_word_alignment_corpus = self._word_alignment_file_service.create_source_word_alignment_corpus()
target_word_alignment_corpus = self._word_alignment_file_service.create_target_word_alignment_corpus()
word_alignment_parallel_corpus: ParallelTextCorpus = source_word_alignment_corpus.align_rows(target_word_alignment_corpus)

self._batch_inference(word_alignment_parallel_corpus, progress_reporter, check_canceled)

self._save_model()
return train_corpus_size
Expand Down
20 changes: 20 additions & 0 deletions machine/jobs/word_alignment_file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ def __init__(
config: Any,
source_filename: str = "train.src.txt",
target_filename: str = "train.trg.txt",
source_word_alignment_filename: str = "align_words.src.json",
target_word_alignment_filename: str = "align_words.trg.json",
word_alignment_filename: str = "word_alignments.json",
) -> None:

self._source_filename = source_filename
self._target_filename = target_filename
self._source_word_alignment_filename = source_word_alignment_filename
self._target_word_alignment_filename = target_word_alignment_filename
self._word_alignment_filename = word_alignment_filename

self.shared_file_service: SharedFileServiceBase = get_shared_file_service(type, config)
Expand All @@ -34,12 +38,28 @@ def create_target_corpus(self) -> TextCorpus:
self.shared_file_service.download_file(f"{self.shared_file_service.build_path}/{self._target_filename}")
)

def create_source_word_alignment_corpus(self) -> TextCorpus:
    """Download the source-side word alignment file and wrap it as a text corpus.

    The remote file is looked up under the shared file service's build path,
    using the configured source word alignment filename.
    """
    remote_path = f"{self.shared_file_service.build_path}/{self._source_word_alignment_filename}"
    local_file = self.shared_file_service.download_file(remote_path)
    return TextFileTextCorpus(local_file)

def create_target_word_alignment_corpus(self) -> TextCorpus:
    """Download the target-side word alignment file and wrap it as a text corpus.

    The remote file is looked up under the shared file service's build path,
    using the configured target word alignment filename.
    """
    remote_path = f"{self.shared_file_service.build_path}/{self._target_word_alignment_filename}"
    local_file = self.shared_file_service.download_file(remote_path)
    return TextFileTextCorpus(local_file)

def exists_source_corpus(self) -> bool:
    """Return whether the source training file exists under the build path."""
    remote_path = f"{self.shared_file_service.build_path}/{self._source_filename}"
    return self.shared_file_service._exists_file(remote_path)

def exists_target_corpus(self) -> bool:
    """Return whether the target training file exists under the build path."""
    remote_path = f"{self.shared_file_service.build_path}/{self._target_filename}"
    return self.shared_file_service._exists_file(remote_path)

def exists_source_word_alignment_corpus(self) -> bool:
    """Return whether the source word alignment file exists under the build path."""
    remote_path = f"{self.shared_file_service.build_path}/{self._source_word_alignment_filename}"
    return self.shared_file_service._exists_file(remote_path)

def exists_target_word_alignment_corpus(self) -> bool:
    """Return whether the target word alignment file exists under the build path."""
    remote_path = f"{self.shared_file_service.build_path}/{self._target_word_alignment_filename}"
    return self.shared_file_service._exists_file(remote_path)

def save_model(self, model_path: Path, destination: str) -> None:
    """Upload the model at ``model_path`` to ``destination`` via the shared file service."""
    upload = self.shared_file_service.upload_path
    upload(model_path, destination)

Expand Down

0 comments on commit 2276dbc

Please sign in to comment.