Skip to content

Commit

Permalink
Add MT tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit committed Oct 15, 2024
1 parent b7c06c8 commit d9ff460
Show file tree
Hide file tree
Showing 5 changed files with 1,226 additions and 15 deletions.
17 changes: 4 additions & 13 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,24 +1,15 @@
{
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.organizeImports": "explicit"
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.analysis.extraPaths": [
"tests"
],
"python.analysis.extraPaths": ["tests"],
"python.analysis.importFormat": "relative",
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
"black-formatter.path": [
"poetry",
"run",
"black"
],
"python.analysis.extraPaths": [
"./tests"
]
}
"black-formatter.path": ["poetry", "run", "black"]
}
19 changes: 18 additions & 1 deletion machine/translation/translation_suggester.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from abc import ABC, abstractmethod
from typing import Iterable, Sequence
from typing import Iterable, Optional, Sequence

from .interactive_translator import InteractiveTranslator
from .translation_result import TranslationResult
from .translation_suggestion import TranslationSuggestion
from .truecaser import Truecaser


class TranslationSuggester(ABC):
Expand All @@ -14,3 +16,18 @@ def __init__(self, confidence_threshold: float = 0, break_on_punctuation: bool =
def get_suggestions(
    self,
    n: int,
    prefix_count: int,
    is_last_word_complete: bool,
    results: Iterable[TranslationResult],
) -> Sequence[TranslationSuggestion]:
    """Produce translation suggestions from a collection of translation results.

    This base version has no body of its own; the enclosing class derives from
    ``ABC``, so concrete subclasses are expected to supply the logic.

    Args:
        n: Presumably the number of suggestions requested -- confirm against
            implementations.
        prefix_count: ``get_suggestions_from_translator`` passes the length of
            the translator's ``prefix_word_ranges`` here.
        is_last_word_complete: ``get_suggestions_from_translator`` passes the
            translator's ``is_last_word_complete`` value here.
        results: The translation results to draw suggestions from.
    """

def get_suggestions_from_translator(
    self, n: int, translator: InteractiveTranslator, truecaser: Optional[Truecaser] = None
) -> Sequence[TranslationSuggestion]:
    """Build suggestions from the current state of an interactive translator.

    Args:
        n: Forwarded unchanged as the first argument of ``get_suggestions``.
        translator: Supplies the current results, the prefix word ranges
            (only their count is used) and the last-word-complete flag.
        truecaser: When given, every current result is passed through
            ``truecaser.truecase_translation_result`` together with the
            translator's target detokenizer before suggestions are computed.

    Returns:
        Whatever ``get_suggestions`` returns for the gathered arguments.
    """

    def _truecased(result):
        # The detokenizer is looked up per result, at the moment the result is consumed.
        return truecaser.truecase_translation_result(result, translator.target_detokenizer)

    candidates = translator.get_current_results()
    if truecaser is not None:
        # map() stays lazy: results are only truecased as get_suggestions iterates them.
        candidates = map(_truecased, candidates)
    prefix_count = len(translator.prefix_word_ranges)
    return self.get_suggestions(n, prefix_count, translator.is_last_word_complete, candidates)
21 changes: 20 additions & 1 deletion machine/translation/truecaser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from abc import ABC, abstractmethod
from typing import Sequence
from typing import Optional, Sequence

from ..corpora.text_corpus import TextCorpus
from ..tokenization.detokenizer import Detokenizer
from ..tokenization.whitespace_detokenizer import WHITESPACE_DETOKENIZER
from .trainer import Trainer
from .translation_result import TranslationResult


class Truecaser(ABC):
Expand All @@ -15,5 +18,21 @@ def train_segment(self, segment: Sequence[str], sentence_start: bool = True) ->
@abstractmethod
def truecase(self, segment: Sequence[str]) -> Sequence[str]:
    """Return the truecased form of a segment.

    Abstract: this declaration has no body of its own. Within this class,
    ``truecase_translation_result`` calls it with a result's target tokens and
    uses the returned sequence as the new target tokens.

    Args:
        segment: The sequence of strings to truecase.
    """

def truecase_translation_result(
    self, result: TranslationResult, detokenizer: Optional[Detokenizer] = None
) -> TranslationResult:
    """Create a copy of a translation result whose target side has been truecased.

    The target tokens are run through ``self.truecase`` and the first
    constructor argument is rebuilt by detokenizing the truecased tokens.
    Source tokens, confidences, sources, alignment and phrases are carried
    over from ``result`` unchanged.

    Args:
        result: The translation result to truecase.
        detokenizer: Detokenizer used to rebuild the text from the truecased
            tokens; ``WHITESPACE_DETOKENIZER`` is used when omitted.

    Returns:
        A new ``TranslationResult``; ``result`` itself is not modified here.
    """
    active_detokenizer = WHITESPACE_DETOKENIZER if detokenizer is None else detokenizer
    cased_tokens = self.truecase(result.target_tokens)
    return TranslationResult(
        active_detokenizer.detokenize(cased_tokens),
        result.source_tokens,
        cased_tokens,
        result.confidences,
        result.sources,
        result.alignment,
        result.phrases,
    )

@abstractmethod
def save(self) -> None:
    """Save this truecaser.

    Abstract: this declaration has no body of its own, so where and how the
    data is stored is decided by the concrete subclass.
    """
Loading

0 comments on commit d9ff460

Please sign in to comment.