
fixup of options
mmoskal committed Jan 23, 2024
1 parent a4cd3d7 commit cb0f973
Showing 4 changed files with 11 additions and 16 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -156,7 +156,7 @@ you can also run tests with `pytest` for the DeclCtrl, or with `./scripts/test-p

 To run rLLM server, go to `rllm/` and run `./server.sh orca`.
 This will run the inference server with Orca-2 13B model (which is expected by testcases).
-If you don't have CUDA, go to `cpp-rllm/` and run `./cpp-server.sh cpu phi2`.
+If you don't have CUDA, go to `cpp-rllm/` and run `./cpp-server.sh phi2`.
 You can also try other models, see [rllm/README.md](rllm/README.md) and
 [cpp-rllm/README.md](cpp-rllm/README.md) for details.
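
For quick reference, the two launch paths this paragraph describes, as shell commands (a sketch; assumes you start from the repository root and have the model weights the scripts expect):

```bash
# with CUDA: run the rLLM server with the Orca-2 13B model
cd rllm && ./server.sh orca

# without CUDA: run the llama.cpp-based server with the phi2 model
cd cpp-rllm && ./cpp-server.sh phi2
```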

5 changes: 2 additions & 3 deletions cpp-rllm/README.md
@@ -11,8 +11,7 @@ If you're not using the supplied docker container follow the
 To compile and run first aicirt and then the rllm server, run:
 
 ```bash
-./cpp-server.sh cpu phi2
+./cpp-server.sh phi2
 ```
 
-You can also try `gpu` instead of `gpu` which will try to use CUDA.
-
+You can also try passing `--cuda` before `phi2`.
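
Both flags go before the model name; a sketch of typical invocations of the updated script:

```bash
./cpp-server.sh phi2            # CPU-only, release build (the new default)
./cpp-server.sh --cuda phi2     # build with the cuda feature enabled
./cpp-server.sh --debug phi2    # debug build (--release is dropped)
```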
18 changes: 6 additions & 12 deletions cpp-rllm/cpp-server.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 set -e
-REL=
+REL=--release
 LOOP=
 BUILD=
 ADD_ARGS=
@@ -24,19 +24,14 @@ fi

 VER="--no-default-features"
 
-if [ "$1" = gpu ] ; then
-    REL=--release
+if [ "$1" = "--cuda" ] ; then
     VER="$VER --features cuda"
     shift
-elif [ "$1" = cpu ] ; then
-    REL=--release
-    shift
-elif [ "$1" = debug ] ; then
+fi
+
+if [ "$1" = "--debug" ] ; then
     REL=
     shift
-else
-    echo "usage: $0 [gpu|cpu|debug] [phi2|orca|build]"
-    exit 1
 fi
 
 case "$1" in
@@ -48,10 +43,9 @@ case "$1" in
         ;;
     build )
         BUILD=1
-        REL=--release
         ;;
     * )
-        echo "try one of models: phi2, orca"
+        echo "usage: $0 [--cuda] [--debug] [phi2|orca|build] [rllm_args...]"
         exit 1
         ;;
 esac
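
One consequence of the sequential `if`/`shift` parsing above: the flags are only recognized in the order `--cuda`, then `--debug`. A small sketch of accepted and rejected orderings (hypothetical invocations, inferred from the script):

```bash
./cpp-server.sh --cuda --debug phi2   # ok: --cuda is consumed first, then --debug
./cpp-server.sh --debug phi2          # ok: --cuda is optional
./cpp-server.sh --debug --cuda phi2   # fails: after --debug is consumed, "--cuda"
                                      # reaches the case statement, which prints usage
```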
2 changes: 2 additions & 0 deletions rllm/src/llamacpp/loader.rs
@@ -46,6 +46,8 @@ fn do_load(args: &mut LoaderArgs) -> Result<cpp::Model> {
     let mut mparams = cpp::ModelParams::default();
     // TODO: make this configurable
     mparams.set_split_mode(cpp::SplitMode::None);
+    // don't GPU offload on Intel macs - it just fails there
+    #[cfg(not(all(target_os = "macos", target_arch = "x86_64")))]
     mparams.n_gpu_layers = 1000;
 
     let m = cpp::Model::from_file(file.to_str().unwrap(), mparams)?;
