scripts for color_universe, begin reading natural languages

CLMBRs · Sep 12, 2024 · b68da30 · b68da30
1 parent ac73f71
commit b68da30
Show file tree

Hide file tree

Showing 7 changed files with 48 additions and 18 deletions.
diff --git a/src/examples/colors/README.md b/src/examples/colors/README.md
@@ -26,6 +26,15 @@ This README first explains the contents of this example directory, focusing on w
 `util.py` contains utility functions, including the argument parser for running this tool from shell. 
 
 ## Usage
+
+From `ultk/examples` base directory:
+1. Run `python -m colors.scripts.read_color_universe`: this generates the color universe (the 330 Munsell chips) to be re-used throughout.
+    - Consumes: `data/cnum-vhcm-lab-new.txt`
+    - Produces: `outputs/color_universe.pkl`
+2. Run `python -m colors.scripts.read_natural_languages`: this reads the natural language WCS data and produces ULTK `Language` objects.  (NOTE: still a work-in-progress)
+    - Consumes: `data/term.txt`, `outputs/color_universe.pkl`
+    - Produces: 
+
 Run `python analyze_data.py` from the `colors` folder. This calls `generate_wcs_languages` to generate the language data, then `complexity.py` to generate the complexity. Several options are available as command-line settings:
 
 

diff --git a/src/examples/colors/__init__.py b/src/examples/colors/__init__.py
diff --git a/src/examples/colors/meaning.py b/src/examples/colors/meaning.py
@@ -1,19 +1,4 @@
-import pandas as pd
+import pickle
 
-from ultk.language.semantics import Universe
-
-
-def munsell_cielab_universe(filename: str) -> Universe:
-    """
-    Load a Universe from a Munsell CIELAB file.
-
-    Note: the tuple Universe.referents will be ordered by the Munsell Chip Number, as used in the WCS data.
-    """
-    referents = pd.read_csv(filename, delimiter="\t")
-    referents.sort_values(by="#cnum", inplace=True)
-    # add a name column, as required by ULTK
-    referents["name"] = referents["#cnum"]
-    return Universe.from_dataframe(referents)
-
-
-color_universe = munsell_cielab_universe("data/cnum-vhcm-lab-new.txt")
+with open("colors/outputs/color_universe.pkl", "rb") as f:
+    color_universe = pickle.load(f)
diff --git a/src/examples/colors/outputs/color_universe.pkl b/src/examples/colors/outputs/color_universe.pkl
diff --git a/src/examples/colors/scripts/__init__.py b/src/examples/colors/scripts/__init__.py
diff --git a/src/examples/colors/scripts/read_color_universe.py b/src/examples/colors/scripts/read_color_universe.py
@@ -0,0 +1,15 @@
+import pandas as pd
+import pickle
+
+from ultk.language.semantics import Universe
+
+
+if __name__ == "__main__":
+    referents = pd.read_csv("colors/data/cnum-vhcm-lab-new.txt", delimiter="\t")
+    referents.sort_values(by="#cnum", inplace=True)
+    # add a name column, as required by ULTK
+    referents["name"] = referents["#cnum"]
+    color_universe = Universe.from_dataframe(referents)
+
+    with open("colors/outputs/color_universe.pkl", "wb") as f:
+        pickle.dump(color_universe, f)
diff --git a/src/examples/colors/scripts/read_natural_languages.py b/src/examples/colors/scripts/read_natural_languages.py
@@ -0,0 +1,21 @@
+import pandas as pd
+
+from ultk.language.language import Expression, Language
+from ..meaning import color_universe
+
+if __name__ == "__main__":
+
+    term_table = pd.read_csv(
+        "data/term.txt", delimiter="\t", names=("lang", "spkr", "cnum", "term")
+    )
+    print(term_table)
+
+    lang_term_chip_counts = term_table.groupby(["lang", "term", "cnum"]).count()
+    print(lang_term_chip_counts)
+    print(lang_term_chip_counts.index)
+
+    for lang, lang_df in lang_term_chip_counts.groupby("lang"):
+        print(lang)
+        for term, term_df in lang_df.groupby("term"):
+            print(term)
+            print(term_df)