Skip to content

Commit

Permalink
feat: add build probability files command
Browse files Browse the repository at this point in the history
  • Loading branch information
bolinocroustibat committed Sep 20, 2023
1 parent 953611b commit 6073686
Show file tree
Hide file tree
Showing 8 changed files with 145 additions and 11 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ pdm run uvicorn api:app --reload

## Commands

- `batch_generate.py` + language
- `build.py`
- `classify_db_generated.py` + language
- `classify_db_real.py` + language (from a dictionary TXT file)
- `build_proba_file.py` + language: Create the probability file for the Markov chain
- `batch_generate.py` + language: Generate a batch of words (500 by default) and save them in DB
- `classify_db_generated.py` + language: Update the generated words in DB with their tense, conjugation, genre, number, etc.
- `classify_db_real.py` + language (from a dictionary TXT file): Update the real words in DB with their tense, conjugation, genre, number, etc.
- `tweet.py` + language + optional: `--dry-run`

To run the commands, use for example:
```bash
python3 -m commands.batch_generate en
python3 -m commands.build_proba_file en
```

# Useful resources
Expand Down
134 changes: 134 additions & 0 deletions commands/build_proba_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import json
from asyncio import run as aiorun
from pathlib import Path

import typer


def build_1char_probabilities(
    alphabet: list[str], dictionary_filepath: Path, json_filepath: Path
) -> dict[str, dict[str, int]]:
    """Count first-order (1-char) Markov transitions from a dictionary file.

    Args:
        alphabet: Characters the chain may use (lowercase).
        dictionary_filepath: Text file with one word per line.
        json_filepath: Kept for interface parity with
            ``build_2char_probabilities``; not used here (the caller writes
            the JSON itself).

    Returns:
        A table mapping "first_letter" and each alphabet character to a row
        of counts: how often each character (or "last_letter", meaning end
        of word) follows it.
    """

    def _new_row() -> dict[str, int]:
        # Each row MUST be a distinct dict: sharing one object across rows
        # (the original bug) makes every increment pollute every row.
        row: dict[str, int] = {char: 0 for char in alphabet}
        row["last_letter"] = 0
        return row

    probabilities: dict[str, dict[str, int]] = {"first_letter": _new_row()}
    for char in alphabet:
        probabilities[char] = _new_row()

    # Populate the table with transition counts.
    with open(dictionary_filepath, "r", encoding="utf-8") as dictionary:
        for line in dictionary:
            word: str = line.strip()
            if not word:
                continue  # blank line: word[0] would raise IndexError
            first_letter: str = word[0].lower()
            if first_letter not in alphabet:
                continue  # same guard as build_2char_probabilities (KeyError otherwise)
            probabilities["first_letter"][first_letter] += 1
            i = 0
            while i < len(word) and word[i].lower() in alphabet:
                current_char = word[i].lower()
                next_char = word[i + 1].lower() if i + 1 < len(word) else None
                if next_char is None or next_char not in alphabet:
                    # End of word (or a character outside the alphabet).
                    probabilities[current_char]["last_letter"] += 1
                    break
                probabilities[current_char][next_char] += 1
                i += 1

    return probabilities


def build_2char_probabilities(
    alphabet: list[str], dictionary_filepath: Path, json_filepath: Path
) -> dict:
    """Count second-order (2-char) Markov transitions from a dictionary file.

    Args:
        alphabet: Characters the chain may use (lowercase).
        dictionary_filepath: Text file with one word per line.
        json_filepath: Kept for interface parity; not used here (the caller
            writes the JSON itself).

    Returns:
        A table with rows for "first_letter" (counts of word-initial
        characters), each single character (counts of the character that
        follows it in position 2), and each 2-char pair (counts of the
        character that follows the pair, or "last_letter" for end of word).
    """

    # Initialize the nested dictionary structure: one row per single char
    # and per 2-char pair, each row its own copy.
    temp: dict = {}
    for letter1 in alphabet:
        for letter2 in alphabet:
            temp[letter1 + letter2] = 0

    alphabet_dict: dict = {char: 0 for char in alphabet}

    temp2: dict = alphabet_dict | temp
    temp3: dict = alphabet_dict | {"last_letter": 0}

    probabilities: dict = {"first_letter": alphabet_dict.copy()} | {
        chars: temp3.copy() for chars in temp2
    }

    # Populate the table with transition counts.
    with open(dictionary_filepath, "r", encoding="utf-8") as dictionary:
        for line in dictionary:
            word: str = line.strip()
            if not word:
                continue  # blank line: word[0] would raise IndexError
            first_letter: str = word[0].lower()
            if first_letter in alphabet:
                probabilities["first_letter"][first_letter] += 1
                second_letter = word[1].lower() if len(word) > 1 else None
                if second_letter is not None and second_letter in alphabet:
                    # Position 1 -> position 2 uses the single-char row.
                    probabilities[first_letter][second_letter] += 1
                    third_letter = word[2].lower() if len(word) > 2 else None
                    if third_letter is None:
                        # Two-letter word: the pair ends the word.
                        probabilities[first_letter + second_letter]["last_letter"] += 1
                    else:
                        # Slide a (char1, char2) -> char3 window over the word.
                        # Invariant: char1 is always in the alphabet here,
                        # because the loop breaks before advancing past a
                        # non-alphabet char2.
                        i = 0
                        while i < len(word):
                            char1 = word[i].lower()
                            char2 = word[i + 1].lower() if i + 1 < len(word) else None
                            char3 = word[i + 2].lower() if i + 2 < len(word) else None
                            if char2 in alphabet and char3 in alphabet:
                                probabilities[char1 + char2][char3] += 1
                            elif char2 in alphabet and char3 not in alphabet:
                                # char3 is None (end of word) or outside the
                                # alphabet: the pair is terminal.
                                probabilities[char1 + char2]["last_letter"] += 1
                                break
                            elif char2 not in alphabet:
                                probabilities[char1]["last_letter"] += 1
                                break
                            i += 1

    return probabilities


def build_chars_probability_file(lang: str, chars_nb: int = 2) -> None:
    """Build the Markov-chain probability table for *lang* and save it as JSON.

    Args:
        lang: Language code; one of "en", "es", "fr", "it".
        chars_nb: Markov-chain order — how many preceding characters are
            used as context (1 or 2).

    Raises:
        typer.Abort: If *lang* or *chars_nb* is not supported.
    """

    if lang not in ["en", "es", "fr", "it"]:
        typer.secho(f"Invalid language: {lang}", fg="red")
        raise typer.Abort()

    if chars_nb not in [1, 2]:
        typer.secho(f"Invalid nb of chars: {chars_nb}", fg="red")
        raise typer.Abort()

    async def _main():
        current_path = Path(__file__).parent.absolute()

        # The alphabet file lists the characters the chain may use.
        with open(
            current_path / f"../{lang}/data/alphabet_{lang.upper()}.json"
        ) as infile:
            alphabet: list[str] = json.load(infile)

        dictionary_filepath: Path = (
            current_path / f"../{lang}/data/dictionary_{lang.upper()}.txt"
        )
        json_filepath: Path = (
            current_path
            / f"../{lang}/data/proba_table_{chars_nb}char_{lang.upper()}.json"
        )

        if chars_nb == 1:
            probabilities: dict = build_1char_probabilities(
                alphabet, dictionary_filepath, json_filepath
            )
        else:
            # chars_nb == 2 — validated above, so `probabilities` is always
            # bound (the original `elif` left it theoretically unbound).
            probabilities = build_2char_probabilities(
                alphabet, dictionary_filepath, json_filepath
            )

        with open(json_filepath, "w", encoding="utf-8") as outfile:
            json.dump(probabilities, outfile, ensure_ascii=False)

        # Report the file that was actually written (the original message
        # wrongly printed the input dictionary path).
        typer.secho(f"File generated as {json_filepath}.", fg="green")

    aiorun(_main())


# CLI entry point: run as `python3 -m commands.build_proba_file <lang>`;
# typer maps the function's parameters to command-line arguments/options.
if __name__ == "__main__":
    typer.run(build_chars_probability_file)
2 changes: 1 addition & 1 deletion en/data/proba_table_2char_EN.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion es/data/proba_table_2char_ES.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion fr/data/proba_table_1char_FR.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion fr/data/proba_table_2char_FR.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion it/data/proba_table_2char_IT.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "word-generator-api"
version = "1.5.2"
version = "1.5.3"
description = "Generates words that don't exist but sound English, French, Spanish or Italian, along with their altered dictionary definitions."
authors = [{ name = "Adrien Carpentier", email = "[email protected]" }]
dependencies = [
Expand Down

0 comments on commit 6073686

Please sign in to comment.