Rename #466

Merged: 9 commits, Mar 8, 2024
Changes from 7 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added the option to directly pass input embeddings to `OLMo` and `OLMoForCausalLM`.
- Added support for Python 3.8.
- Added code to throw an error if `output_attentions` is set to `True` in forward call to `OLMoForCausalLM`. This functionality hasn't been implemented yet.
+- Rename `Olmo` to `OLMo` everywhere in the codebase
> Collaborator review comment on the added line: Please move this to Unreleased or v0.3.0 or similar.

- Fixed running with data loading workers on LUMI

### Added
4 changes: 2 additions & 2 deletions docs/NOTES.md
@@ -70,10 +70,10 @@ For example, checkpoints for the run [https://wandb.ai/ai2-llm/c4-small/runs/euo
You can load a checkpoint like this:

```python
-from olmo import Olmo, Tokenizer
+from olmo import OLMo, Tokenizer

checkpoint = "gs://ai2-olmo/ai2-llm/c4-small/euox4j8q/step73000-unsharded"
-model = Olmo.from_checkpoint(checkpoint, device="cuda")
+model = OLMo.from_checkpoint(checkpoint, device="cuda")
tokenizer = Tokenizer.from_checkpoint(checkpoint)
```

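As a follow-up to the loading snippet above, here is a minimal usage sketch (not part of this PR): it assumes the `Tokenizer` exposes `encode`/`decode` and that the model's output carries a `logits` field, which matches the `olmo` package at the time of this change.

```python
import torch

# Tokenize a prompt and run a single forward pass with the loaded OLMo model.
input_ids = torch.tensor([tokenizer.encode("Language modeling is")], device="cuda")
with torch.inference_mode():
    output = model(input_ids)                 # `model` is the OLMo instance from above
next_token_id = int(output.logits[0, -1].argmax())
print(tokenizer.decode([next_token_id]))      # greedy next-token continuation
```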
4 changes: 2 additions & 2 deletions hf_olmo/configuration_olmo.py
@@ -21,8 +21,8 @@ def __init__(self, use_cache: bool = False, **kwargs):
all_kwargs.update({"use_cache": use_cache})
all_kwargs.update(
{
"architectures": all_kwargs.get("architectures", ["OlmoModelForCausalLM"])
or ["OlmoModelForCausalLM"]
"architectures": all_kwargs.get("architectures", ["OLMoModelForCausalLM"])
or ["OLMoModelForCausalLM"]
}
)
super().__init__(**all_kwargs)
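The effect of the renamed default is easy to check; a minimal sketch, assuming `OLMoConfig()` is constructible from its defaults:

```python
from hf_olmo.configuration_olmo import OLMoConfig

cfg = OLMoConfig()            # no explicit architectures passed
print(cfg.architectures)      # expected: ["OLMoModelForCausalLM"] after this change
print(cfg.use_cache)          # False, per the constructor signature above
```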
6 changes: 3 additions & 3 deletions hf_olmo/modeling_olmo.py
@@ -7,7 +7,7 @@
from transformers.models.auto import AutoModelForCausalLM

from olmo.config import ModelConfig
-from olmo.model import Olmo
+from olmo.model import OLMo

from .configuration_olmo import OLMoConfig

@@ -34,14 +34,14 @@ class OLMoForCausalLM(PreTrainedModel):
base_model_prefix = "model"
_no_split_modules = ["OLMoBlock"]

-def __init__(self, config: OLMoConfig, model: Optional[Olmo] = None, init_params: bool = False):
+def __init__(self, config: OLMoConfig, model: Optional[OLMo] = None, init_params: bool = False):
super().__init__(config)

if not model:
model_config = create_model_config_from_pretrained_config(config)
# Initialize model (always on CPU to start with so we don't run out of GPU memory).
model_config.init_device = "cpu"
-self.model = Olmo(model_config, init_params=init_params)
+self.model = OLMo(model_config, init_params=init_params)
else:
self.model = model

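A hedged construction sketch for the renamed wrapper (not part of this PR; it assumes `OLMoConfig()` carries the usual model defaults, and the commented checkpoint path is a placeholder):

```python
from hf_olmo.configuration_olmo import OLMoConfig
from hf_olmo.modeling_olmo import OLMoForCausalLM

# Build the HF wrapper from a config alone; per the constructor above, the inner
# OLMo model is created on CPU first so GPU memory is not exhausted at init time.
config = OLMoConfig(use_cache=False)
hf_model = OLMoForCausalLM(config, init_params=True)

# Alternatively, wrap an OLMo model that was already loaded from a checkpoint:
# from olmo.model import OLMo
# olmo_model = OLMo.from_checkpoint("<checkpoint-dir>", device="cuda")   # placeholder path
# hf_model = OLMoForCausalLM(config, model=olmo_model)
```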
2 changes: 1 addition & 1 deletion hf_olmo/tokenization_olmo_fast.py
@@ -4,7 +4,7 @@


class OLMoTokenizerFast(PreTrainedTokenizerFast):
-# Note: Olmo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary.
+# Note: OLMo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary.
pass

# def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
4 changes: 2 additions & 2 deletions inference/NOTES.md
@@ -45,12 +45,12 @@ To add an `olmo.py` module, we can basically just imitate what was done for othe
There's one important wrinkle here: some OLMo models use *fused linear attention*. I'm not sure how GPTQ handles this or whether any existing supported models implement attention the same way. This might be something to discuss with Dirk and Pete.

```python
-Olmo(
+OLMo(
(transformer): ModuleDict(
(wte): Embedding(50304, 768)
(emb_drop): Dropout(p=0.1, inplace=False)
(blocks): ModuleList(
-(0-11): 12 x OlmoSequentialBlock(
+(0-11): 12 x OLMoSequentialBlock(
(dropout): Dropout(p=0.1, inplace=False)
(norm): LayerNorm()
(act): SwiGLU()
    ...
```
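For readers unfamiliar with the fused attention noted above, here is an illustrative sketch (not code from this PR) of what a fused q/k/v projection such as OLMo's `att_proj` does; the width matches the 768-dimensional model printed above, and the bias setting is an assumption:

```python
import torch
import torch.nn as nn

d_model = 768
# One fused projection produces Q, K and V in a single matmul. GPTQ therefore has
# to treat this single Linear as the quantization unit, rather than the separate
# q_proj/k_proj/v_proj modules found in LLaMA-style models.
att_proj = nn.Linear(d_model, 3 * d_model, bias=False)

x = torch.randn(2, 16, d_model)           # (batch, seq_len, d_model)
q, k, v = att_proj(x).chunk(3, dim=-1)    # split the fused output back into q, k, v
print(q.shape, k.shape, v.shape)          # each: torch.Size([2, 16, 768])
```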
@@ -13,7 +13,7 @@
from .internlm import InternLMGPTQForCausalLM
from .llama import LlamaGPTQForCausalLM
from .moss import MOSSGPTQForCausalLM
-from .olmo import OlmoGPTQForCausalLM
+from .olmo import OLMoGPTQForCausalLM
from .opt import OPTGPTQForCausalLM
from .qwen import QwenGPTQForCausalLM
from .rw import RWGPTQForCausalLM
@@ -24,7 +24,7 @@
"gptj": GPTJGPTQForCausalLM,
"gpt2": GPT2GPTQForCausalLM,
"llama": LlamaGPTQForCausalLM,
"olmo": OlmoGPTQForCausalLM,
"olmo": OLMoGPTQForCausalLM,
"opt": OPTGPTQForCausalLM,
"moss": MOSSGPTQForCausalLM,
"gpt_bigcode": GPTBigCodeGPTQForCausalLM,
@@ -1,7 +1,7 @@
from ._base import *


-class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
# Attribute name of Transformer layer block.
layers_block_name = "model.transformer.blocks"

@@ -19,4 +19,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]]


__all__ = ["OlmoGPTQForCausalLM"]
__all__ = ["OLMoGPTQForCausalLM"]
8 changes: 4 additions & 4 deletions inference/compression/olmo_gptq_class.py
@@ -1,7 +1,7 @@
from auto_gptq.modeling._base import BaseGPTQForCausalLM


-class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
# Attribute name of Transformer layer block.
layers_block_name = "model.transformer.blocks"

@@ -17,12 +17,12 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]]


__all__ = ["OlmoGPTQForCausalLM"]
__all__ = ["OLMoGPTQForCausalLM"]

# NOTE: In progress; may change if OLMo model is updated.


-# class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+# class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
# # Attribute name of Transformer layer block.
# layers_block_name = "transformer.blocks" # NOTE(wadden) Correct
#
@@ -51,4 +51,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
# ]


# __all__ = ["OlmoGPTQForCausalLM"]
# __all__ = ["OLMoGPTQForCausalLM"]
6 changes: 3 additions & 3 deletions olmo/config.py
@@ -23,7 +23,7 @@
from torch.distributed.fsdp import MixedPrecision, ShardingStrategy

from .aliases import PathOrStr
-from .exceptions import OlmoConfigurationError
+from .exceptions import OLMoConfigurationError
from .util import StrEnum

__all__ = [
@@ -116,7 +116,7 @@ def new(cls: Type[C], **kwargs) -> C:
conf = om.merge(conf, kwargs)
return cast(C, om.to_object(conf))
except OmegaConfBaseException as e:
-raise OlmoConfigurationError(str(e))
+raise OLMoConfigurationError(str(e))

@classmethod
def load(
@@ -139,7 +139,7 @@ def load(
conf = om.merge(conf, om.from_dotlist(overrides))
return cast(C, om.to_object(conf))
except OmegaConfBaseException as e:
-raise OlmoConfigurationError(str(e))
+raise OLMoConfigurationError(str(e))

def save(self, path: PathOrStr) -> None:
"""Save to a YAML file."""
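To show where the renamed `OLMoConfigurationError` surfaces in practice, a minimal sketch (the YAML path and the override are placeholders; `TrainConfig` is assumed to be the usual entry point in `olmo.config`):

```python
from olmo.config import TrainConfig
from olmo.exceptions import OLMoConfigurationError

try:
    cfg = TrainConfig.load(
        "configs/my-run.yaml",                     # placeholder path
        overrides=["model.d_model=not_an_int"],    # bad dotlist value to trigger the error
    )
except OLMoConfigurationError as err:
    print(f"invalid configuration: {err}")
```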
8 changes: 4 additions & 4 deletions olmo/data/__init__.py
@@ -5,7 +5,7 @@

from ..aliases import PathOrStr
from ..config import DataConfig, TrainConfig
-from ..exceptions import OlmoConfigurationError
+from ..exceptions import OLMoConfigurationError
from ..torch_util import barrier, get_global_rank, get_world_size
from .collator import DataCollator
from .iterable_dataset import IterableDataset
@@ -21,7 +21,7 @@ def build_memmap_dataset(
metadata: List[Dict[str, Any]] = []
if data_config.paths:
if data_config.datasets:
-raise OlmoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets")
+raise OLMoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets")
paths = data_config.paths
for path in paths:
metadata.append({"path": str(path)})
@@ -32,7 +32,7 @@
paths.extend(label_paths)
metadata.extend([{"label": label}] * len(label_paths))
else:
-raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
+raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
return MemMapDataset(
*paths,
chunk_size=train_config.model.max_sequence_length,
@@ -87,7 +87,7 @@ def build_train_dataloader(train_config: TrainConfig) -> DataLoader:
work_dir = Path(train_config.save_folder) / "train_data"
if get_global_rank() == 0:
if work_dir.is_dir() and not train_config.save_overwrite:
-raise OlmoConfigurationError(
+raise OLMoConfigurationError(
"train data working directory already exists, use --save_overwrite to overwrite"
)
else:
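A small sketch of the mutual-exclusivity rule enforced above (field values are placeholders; it assumes `DataConfig`'s remaining fields have defaults):

```python
from olmo.config import DataConfig

# Valid: exactly one of `paths` or `datasets` is given.
ok = DataConfig(paths=["/data/part-000.npy"])

# Invalid: both are given, so build_memmap_dataset(...) raises OLMoConfigurationError;
# giving neither raises it as well.
bad = DataConfig(
    paths=["/data/part-000.npy"],
    datasets={"wiki": ["/data/wiki-000.npy"]},
)
```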
4 changes: 2 additions & 2 deletions olmo/data/memmap_dataset.py
@@ -7,7 +7,7 @@
import torch
from torch.utils.data import Dataset

-from olmo.exceptions import OlmoEnvironmentError
+from olmo.exceptions import OLMoEnvironmentError

from ..aliases import PathOrStr
from ..util import _get_s3_client, file_size, get_bytes_range
@@ -93,7 +93,7 @@ def offsets(self) -> List[Tuple[int, int]]:
_get_s3_client("s3")
try:
_get_s3_client("r2")
-except OlmoEnvironmentError:
+except OLMoEnvironmentError:
# R2 might not be needed, so ignore this error. We will get an error
# later if R2 is needed.
pass
4 changes: 2 additions & 2 deletions olmo/eval/__init__.py
@@ -5,7 +5,7 @@
from torchmetrics import MeanMetric, Metric

from ..config import EvaluatorConfig, EvaluatorType, TrainConfig
-from ..exceptions import OlmoConfigurationError
+from ..exceptions import OLMoConfigurationError
from ..tokenizer import Tokenizer
from ..torch_util import get_global_rank, get_world_size
from .downstream import ICLMetric, label_to_task_map
@@ -93,7 +93,7 @@ def make_metric():
elif eval_config.data.datasets:
eval_metric = {label: make_metric() for label in eval_config.data.datasets.keys()}
else:
-raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
+raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")

return Evaluator(
label=eval_config.label,
14 changes: 7 additions & 7 deletions olmo/exceptions.py
@@ -1,37 +1,37 @@
__all__ = ["OlmoError", "OlmoConfigurationError", "OlmoCliError", "OlmoEnvironmentError", "OlmoNetworkError"]
__all__ = ["OLMoError", "OLMoConfigurationError", "OLMoCliError", "OLMoEnvironmentError", "OLMoNetworkError"]


-class OlmoError(Exception):
+class OLMoError(Exception):
"""
Base class for all custom OLMo exceptions.
"""


-class OlmoConfigurationError(OlmoError):
+class OLMoConfigurationError(OLMoError):
"""
An error with a configuration file.
"""


-class OlmoCliError(OlmoError):
+class OLMoCliError(OLMoError):
"""
An error from incorrect CLI usage.
"""


-class OlmoEnvironmentError(OlmoError):
+class OLMoEnvironmentError(OLMoError):
"""
An error from incorrect environment variables.
"""


-class OlmoNetworkError(OlmoError):
+class OLMoNetworkError(OLMoError):
"""
An error with a network request.
"""


-class OlmoThreadError(Exception):
+class OLMoThreadError(Exception):
"""
Raised when a thread fails.
"""