From 69f6f3fd7a6f6e14ad38d6fe83e56aa8190ad64f Mon Sep 17 00:00:00 2001 From: Eric Hunsberger Date: Wed, 28 Oct 2020 10:01:18 -0400 Subject: [PATCH 01/11] Speed up block splitting Also minor speedup to hardware build --- nengo_loihi/block.py | 3 +- nengo_loihi/builder/split_blocks.py | 52 +++++++++++++++++++---------- nengo_loihi/hardware/builder.py | 6 ++-- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/nengo_loihi/block.py b/nengo_loihi/block.py index c4b98ed05..430a91892 100644 --- a/nengo_loihi/block.py +++ b/nengo_loihi/block.py @@ -517,13 +517,14 @@ def bits_per_axon(self, n_weights): synapse_idx_bits = 4 n_synapses_bits = 6 + bits_per_memunit = 64 bits = 0 synapses_per_block = self.n_synapses + 1 for i in range(0, n_weights, synapses_per_block): n = min(n_weights - i, synapses_per_block) bits_i = n * bits_per_weight + synapse_idx_bits + n_synapses_bits # round up to nearest memory unit - bits_i = -64 * (-bits_i // 64) + bits_i = -bits_per_memunit * (-bits_i // bits_per_memunit) bits += bits_i return bits diff --git a/nengo_loihi/builder/split_blocks.py b/nengo_loihi/builder/split_blocks.py index 3e1d5a0cc..d0ba31435 100644 --- a/nengo_loihi/builder/split_blocks.py +++ b/nengo_loihi/builder/split_blocks.py @@ -70,7 +70,7 @@ def ceil_div(a, b): return -((-a) // b) -def split_model(model): # noqa: C901 +def split_model(model, validate=1): # noqa: C901 """Split blocks in the given model that exceed the hardware constraints. Will split any block that has more than the allowable number of compartments, @@ -85,6 +85,8 @@ def split_model(model): # noqa: C901 ---------- model : `nengo_loihi.builder.Model` The model whose blocks should be split. + validate : int + Level of validation to perform: 0 = none, 1 = minimal, 2 = maximal Returns ------- @@ -106,7 +108,7 @@ def split_model(model): # noqa: C901 synapse_map = {} for old_block in model.blocks: - new_blocks = split_block(old_block, model.block_shapes) + new_blocks = split_block(old_block, model.block_shapes, validate=validate) block_map[old_block] = new_blocks if len(new_blocks) == 1: @@ -117,7 +119,9 @@ def split_model(model): # noqa: C901 else: # break apart synapses for old_synapse in old_block.synapses: - new_synapse_axons = split_synapse(old_block, old_synapse, new_blocks) + new_synapse_axons = split_synapse( + old_block, old_synapse, new_blocks, validate=validate + ) synapse_map[old_synapse] = new_synapse_axons for old_block in model.blocks: @@ -127,7 +131,7 @@ def split_model(model): # noqa: C901 split_input_axons(input, block_map, synapse_map) for probe in model.probes: - split_probe(probe, block_map, synapse_map) + split_probe(probe, block_map, synapse_map, validate=validate) new_blocks = [block for group in block_map.values() for block in group] @@ -145,7 +149,7 @@ def split_model(model): # noqa: C901 return block_map -def split_probe(probe, block_map, synapse_map): +def split_probe(probe, block_map, synapse_map, validate=1): """Modify probe in place to target new blocks""" assert len(probe.target) == len(probe.slice) == len(probe.weights) == 1 old_block = probe.target[0] @@ -205,7 +209,8 @@ def split_probe(probe, block_map, synapse_map): ids = np.array([i for ii in ids for i in ii]) assert ids.shape == old_comp_ids.shape - assert np.array_equal(np.unique(ids), old_comp_ids) + if validate >= 1: + assert np.array_equal(np.unique(ids), old_comp_ids) if is_transformed or np.array_equal(ids, old_comp_ids): # weighted probes don't need reindexing because summed outputs are ordered @@ -319,7 +324,7 @@ def 
split_axon(old_axon, old_axon_idxs, old_atoms, new_synapses): return new_axons -def split_block(old_block, block_shapes): +def split_block(old_block, block_shapes, validate=1): """Break a block apart into smaller blocks, each able to fit on one core""" n_compartments = old_block.compartment.n_compartments n_in_axons = sum(synapse.n_axons for synapse in old_block.synapses) @@ -361,7 +366,8 @@ def split_block(old_block, block_shapes): assert len(new_block_inds) > 0 if len(new_block_inds) == 1: # if block can fit on one core, just return the current block - assert new_block_inds[0].set == set(range(n_compartments)) + if validate >= 1: + assert new_block_inds[0].set == set(range(n_compartments)) new_blocks = [old_block] return OrderedDict(zip(new_blocks, new_block_inds)) @@ -421,7 +427,7 @@ def split_block(old_block, block_shapes): return OrderedDict(zip(new_blocks, new_block_inds)) -def split_synapse(old_block, old_synapse, new_blocks): +def split_synapse(old_block, old_synapse, new_blocks, validate=1): """Break a synapse apart to work with new blocks Parameters @@ -462,9 +468,10 @@ def split_synapse(old_block, old_synapse, new_blocks): for axon_idx in range(old_synapse.n_axons): weight_idx = old_synapse.axon_weight_idx(axon_idx) indices = old_synapse.indices[weight_idx] - assert all( - np.array_equal(i, indices[0]) for i in indices[1:] - ), "All atoms must target same indices" + if validate >= 1: + assert all( + np.array_equal(i, indices[0]) for i in indices[1:] + ), "All atoms must target same indices" indices = indices[0] base = old_synapse.axon_compartment_base(axon_idx) @@ -514,6 +521,7 @@ def split_synapse(old_block, old_synapse, new_blocks): block_comp_ids, axon_overlaps, axon_ids, + validate=validate, ) logger.info( @@ -530,7 +538,13 @@ def split_synapse(old_block, old_synapse, new_blocks): def set_new_synapse_weights( - old_synapse, old_input_axons, new_synapse, block_comp_ids, axon_overlaps, axon_ids + old_synapse, + old_input_axons, + new_synapse, + block_comp_ids, + axon_overlaps, + axon_ids, + validate=1, ): has_shared_weights = old_synapse.axon_to_weight_map is not None @@ -543,7 +557,8 @@ def set_new_synapse_weights( new_axon_compartment_bases = [] compartment_map = dict(zip(block_comp_ids, range(len(block_comp_ids)))) - new_block_comp_idxs = IndicesList(range(len(block_comp_ids))) + if validate >= 2: + new_block_comp_idxs = IndicesList(range(len(block_comp_ids))) # iterate over all old axon ids that will also input to this new synapse for old_axon_id in axon_ids: @@ -557,7 +572,7 @@ def set_new_synapse_weights( valid_comp_ids = old_axon_comp_ids else: i_valid = np.array( - [i in block_comp_ids for i in old_axon_comp_ids], dtype=bool + [i in block_comp_ids.set for i in old_axon_comp_ids.flat], dtype=bool ) ww = old_weights[:, i_valid] ii = old_indices[:, i_valid] @@ -599,8 +614,11 @@ def set_new_synapse_weights( weight_idx_map[key] = len(weights) weights.append(ww) indices.append(new_ii) - assert all(new_base + i in new_block_comp_idxs for i in new_ii.flat) - else: + + if validate >= 2: + check_inds = new_base + new_ii + assert set(check_inds.flat).issubset(new_block_comp_idxs.set) + elif validate >= 2: # we have these weights/indices in memory, double check they're the same weight_idx = weight_idx_map[key] assert np.array_equal(ww, weights[weight_idx]) diff --git a/nengo_loihi/hardware/builder.py b/nengo_loihi/hardware/builder.py index d09dbba06..21799b93d 100644 --- a/nengo_loihi/hardware/builder.py +++ b/nengo_loihi/hardware/builder.py @@ -296,14 +296,14 @@ def 
build_block(nxsdk_core, core, block, compartment_idxs, ax_range):
     logger.debug("Building %s on core.id=%d", block, nxsdk_core.id)
 
-    for i, bias in enumerate(block.compartment.bias):
-        bman, bexp = bias_to_manexp(bias)
+    bman, bexp = bias_to_manexp(block.compartment.bias)
+    for i, _ in enumerate(bman):
         icomp = core.compartment_cfg_idxs[block][i]
         ivth = core.vth_cfg_idxs[block][i]
 
         ii = compartment_idxs[i]
         nxsdk_core.cxCfg[ii].configure(
-            bias=bman, biasExp=bexp, vthProfile=ivth, cxProfile=icomp
+            bias=bman[i], biasExp=bexp[i], vthProfile=ivth, cxProfile=icomp
         )
 
         phasex = "phase%d" % (ii % 4,)

From 183c872354d90b76739595c64850499c8c4a89f6 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger
Date: Wed, 28 Oct 2020 10:01:53 -0400
Subject: [PATCH 02/11] Explicitly test q0 and current overflow

These were being tested before; it just wasn't evident or assured.

---
 nengo_loihi/emulator/tests/test_interface.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nengo_loihi/emulator/tests/test_interface.py b/nengo_loihi/emulator/tests/test_interface.py
index e1d5659b9..fd11d5750 100644
--- a/nengo_loihi/emulator/tests/test_interface.py
+++ b/nengo_loihi/emulator/tests/test_interface.py
@@ -81,8 +81,9 @@ def test_uv_overflow(n_axons, plt, allclose, monkeypatch):
     assert EmulatorInterface.strict  # Tests should be run in strict mode
     monkeypatch.setattr(EmulatorInterface, "strict", False)
 
+    overflow_var = "q0" if n_axons == 1000 else "current"
     with EmulatorInterface(model) as emu:
-        with pytest.warns(UserWarning):
+        with pytest.warns(UserWarning, match=f"Overflow in {overflow_var}"):
             emu.run_steps(nt)
         emu_u = emu.collect_probe_output(probe_u)
         emu_v = emu.collect_probe_output(probe_v)

From 805208055ec43e60c62715ea6198ae264e31ee25 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger
Date: Tue, 2 Feb 2021 16:18:50 -0500
Subject: [PATCH 03/11] Fix r in decay_magnitude docstring

---
 nengo_loihi/builder/discretize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nengo_loihi/builder/discretize.py b/nengo_loihi/builder/discretize.py
index 9ca043e09..7e1158560 100644
--- a/nengo_loihi/builder/discretize.py
+++ b/nengo_loihi/builder/discretize.py
@@ -146,7 +146,7 @@ def decay_magnitude(decay, x0=2 ** 21, bits=12, offset=0):
 
         x_i = floor(r x_{i-1})
 
-    where ``r = (2**bits - offset - decay)``.
+    where ``r = (2**bits - offset - decay) / 2**bits``.
 
     To simulate the effects of rounding in decay, we subtract an expected loss
     due to rounding (``q``) each iteration. Our estimated series is therefore::

From d131b7e05b9276f1f0cf67513b57d8da04463462 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger
Date: Thu, 29 Oct 2020 11:15:33 -0400
Subject: [PATCH 04/11] Properly pass dt for preset DecodeNeurons

---
 CHANGES.rst                              |  2 ++
 nengo_loihi/decode_neurons.py            | 24 ++++++++++++++----------
 nengo_loihi/tests/test_decode_neurons.py | 17 ++++++++++++++---
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 1b98df241..a26564804 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -44,10 +44,12 @@ Release history
   is deleted. (`#312`_)
 - Fixed probe filters such that multiple ``Simulator.run`` calls now results in
   the same probe data as a single call of equivalent length. (`#271`_, `#303`_)
+- Fixed handling of ``dt`` within ``DecodeNeurons`` for ``dt != 0.001``. (`#309`_)
 
 .. _#271: https://github.com/nengo/nengo-loihi/issues/271
 .. _#289: https://github.com/nengo/nengo-loihi/pull/289
 .. _#303: https://github.com/nengo/nengo-loihi/pull/303
+.. 
_#309: https://github.com/nengo/nengo-loihi/pull/309 .. _#312: https://github.com/nengo/nengo-loihi/pull/312 .. _#317: https://github.com/nengo/nengo-loihi/pull/317 .. _#320: https://github.com/nengo/nengo-loihi/pull/320 diff --git a/nengo_loihi/decode_neurons.py b/nengo_loihi/decode_neurons.py index a1aeb9271..4459b4dbf 100644 --- a/nengo_loihi/decode_neurons.py +++ b/nengo_loihi/decode_neurons.py @@ -261,6 +261,10 @@ class Preset5DecodeNeurons(OnOffDecodeNeurons): nengo-loihi-sandbox/utils/interneuron_unidecoder_design.py """ + # TODO: why does this scale factor help? Found it empirically in + # test_decode_neurons.test_add_inputs (see there for a description) + scale_factor = 1.05 + def __init__(self, dt=0.001, rate=None): super().__init__(pairs_per_dim=5, dt=dt, rate=rate) @@ -270,14 +274,12 @@ def __init__(self, dt=0.001, rate=None): gain, bias = self.neuron_type.gain_bias(max_rates, intercepts) target_point = 0.85 - target_rate = np.sum(self.neuron_type.rates(target_point, gain, bias)) - self.scale = 1.08 * target_point / (self.dt * target_rate) - # ^ TODO: why does this 1.08 factor help? found it empirically in - # test_decode_neurons.test_add_inputs + target_rate = np.sum(self.neuron_type.rates(target_point, gain, bias, dt=dt)) + self.scale = self.scale_factor * target_point / (self.dt * target_rate) + # repeat gains/biases for on/off neurons self.gain = gain.repeat(2) self.bias = bias.repeat(2) - # ^ repeat for on/off neurons def __str__(self): return "%s(dt=%0.3g, rate=%0.3g)" % (type(self).__name__, self.dt, self.rate) @@ -290,6 +292,10 @@ class Preset10DecodeNeurons(OnOffDecodeNeurons): nengo-loihi-sandbox/utils/interneuron_unidecoder_design.py """ + # TODO: why does this scale factor help? Found it empirically in + # test_decode_neurons.test_add_inputs (see there for a description) + scale_factor = 1.05 + def __init__(self, dt=0.001, rate=None): super().__init__(pairs_per_dim=10, dt=dt, rate=rate) @@ -300,14 +306,12 @@ def __init__(self, dt=0.001, rate=None): gain, bias = self.neuron_type.gain_bias(max_rates, intercepts) target_point = 1.0 - target_rate = np.sum(self.neuron_type.rates(target_point, gain, bias)) - self.scale = 1.05 * target_point / (self.dt * target_rate) - # ^ TODO: why does this 1.05 factor help? found it empirically in - # test_decode_neurons.test_add_inputs + target_rate = np.sum(self.neuron_type.rates(target_point, gain, bias, dt=dt)) + self.scale = self.scale_factor * target_point / (self.dt * target_rate) + # repeat gains/biases for on/off neurons self.gain = gain.repeat(2) self.bias = bias.repeat(2) - # ^ repeat for on/off neurons def __str__(self): return "%s(dt=%0.3g, rate=%0.3g)" % (type(self).__name__, self.dt, self.rate) diff --git a/nengo_loihi/tests/test_decode_neurons.py b/nengo_loihi/tests/test_decode_neurons.py index 5ecf9dbae..b58a83468 100644 --- a/nengo_loihi/tests/test_decode_neurons.py +++ b/nengo_loihi/tests/test_decode_neurons.py @@ -23,6 +23,15 @@ ], ) def test_add_inputs(decode_neurons, tolerance, Simulator, seed, plt): + """Test the addition of two inputs with DecodeNeurons. + + Note: This test forms the basis for the scale factors for Preset5DecodeNeurons + and Preset10DecodeNeurons. It is unclear exactly why these scale factors help. + The best values depend on the exact inputs below, as well as the seed used for + this test. More testing is needed to find optimal scale factors, or (ideally) + get rid of them completely if we can better understand the underlying mechanics. 
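+    The factors are exposed as the ``scale_factor`` class attribute on
+    ``Preset5DecodeNeurons`` and ``Preset10DecodeNeurons``, so they are easy to
+    adjust when experimenting.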
+ """ + sim_time = 2.0 pres_time = sim_time / 4 eval_time = sim_time / 8 @@ -36,6 +45,8 @@ def test_add_inputs(decode_neurons, tolerance, Simulator, seed, plt): {t: stim_values[i][1] for i, t in enumerate(stim_times)} ) + probe_solver = nengo.solvers.LstsqL2nz(reg=0.01) + with nengo.Network(seed=seed) as model: stim_a = nengo.Node(stim_fn_a) stim_b = nengo.Node(stim_fn_b) @@ -54,9 +65,9 @@ def test_add_inputs(decode_neurons, tolerance, Simulator, seed, plt): stim_synapse = out_synapse.combine(nengo.Alpha(0.005)).combine( nengo.Alpha(0.005) ) - p_stim_a = nengo.Probe(stim_a, synapse=stim_synapse) - p_stim_b = nengo.Probe(stim_b, synapse=stim_synapse) - p_c = nengo.Probe(c, synapse=out_synapse) + p_stim_a = nengo.Probe(stim_a, synapse=stim_synapse, solver=probe_solver) + p_stim_b = nengo.Probe(stim_b, synapse=stim_synapse, solver=probe_solver) + p_c = nengo.Probe(c, synapse=out_synapse, solver=probe_solver) build_model = Model() build_model.decode_neurons = decode_neurons From fe7c03ebafaadb9703744b5595d5784027daa768 Mon Sep 17 00:00:00 2001 From: Eric Hunsberger Date: Wed, 4 Nov 2020 12:25:07 -0500 Subject: [PATCH 05/11] Use nengo.rc.rc.float_dtype, reduce memory - Respect nengo.rc.rc.float_dtype where we can - Use int32 in a number of places to reduce memory --- CHANGES.rst | 3 +++ nengo_loihi/block.py | 14 +++++++++----- nengo_loihi/builder/connection.py | 22 +++++++++++++++------- nengo_loihi/builder/ensemble.py | 18 ++++++++++++++---- nengo_loihi/conv.py | 14 +++++++------- 5 files changed, 48 insertions(+), 23 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index a26564804..923443bdb 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -37,6 +37,8 @@ Release history - Build errors specify the associated objects, making them easier to debug. (`#289`_) - Deobfuscated NxSDK API calls. (`#320`_) +- The builder now respects the `precision.bits`_ attribute in ``nengorc`` files, + allowing for reduced-precision builds to save memory. (`#309`_) **Fixed** @@ -53,6 +55,7 @@ Release history .. _#312: https://github.com/nengo/nengo-loihi/pull/312 .. _#317: https://github.com/nengo/nengo-loihi/pull/317 .. _#320: https://github.com/nengo/nengo-loihi/pull/320 +.. 
_precision.bits: https://www.nengo.ai/nengo/nengorc.html#configuration-options 1.0.0 (January 20, 2021) ======================== diff --git a/nengo_loihi/block.py b/nengo_loihi/block.py index 430a91892..ddbee993b 100644 --- a/nengo_loihi/block.py +++ b/nengo_loihi/block.py @@ -154,19 +154,21 @@ class Compartment: def __init__(self, n_compartments, label=None): self.n_compartments = n_compartments self.label = label + # dtype must be float32, because of how we discretize in place to int32 + self.dtype = np.float32 # parameters specific to compartments/block - self.decay_u = np.ones(n_compartments, dtype=np.float32) + self.decay_u = np.ones(n_compartments, dtype=self.dtype) # ^ default to no filter - self.decay_v = np.zeros(n_compartments, dtype=np.float32) + self.decay_v = np.zeros(n_compartments, dtype=self.dtype) # ^ default to integration self.tau_s = None self.scale_u = True self.scale_v = False self.refract_delay = np.zeros(n_compartments, dtype=np.int32) - self.vth = np.zeros(n_compartments, dtype=np.float32) - self.bias = np.zeros(n_compartments, dtype=np.float32) + self.vth = np.zeros(n_compartments, dtype=self.dtype) + self.bias = np.zeros(n_compartments, dtype=self.dtype) self.enable_noise = np.zeros(n_compartments, dtype=bool) # parameters common to core @@ -683,9 +685,11 @@ def _set_weights_indices( self, weights, indices=None, - weight_dtype=np.float32, + weight_dtype=None, compression=0, ): + # must be float32, because of how we discretize in place to int32 + weight_dtype = np.float32 if weight_dtype is None else weight_dtype weights = [ np.array(w, copy=False, dtype=weight_dtype, ndmin=2) for w in weights ] diff --git a/nengo_loihi/builder/connection.py b/nengo_loihi/builder/connection.py index 29a81be68..d185e18c5 100644 --- a/nengo_loihi/builder/connection.py +++ b/nengo_loihi/builder/connection.py @@ -13,6 +13,7 @@ from nengo.connection import LearningRule from nengo.ensemble import Neurons from nengo.exceptions import BuildError, ValidationError +from nengo.rc import rc from nengo.solvers import Solver from nengo_loihi.block import Axon, LoihiBlock, Synapse @@ -521,6 +522,10 @@ def build_full_chip_connection(model, conn): # noqa: C901 if neuron_type is not None and hasattr(neuron_type, "amplitude"): weights = scale_matrix(weights, neuron_type.amplitude) + # to proper dtype + transform = transform.astype(rc.float_dtype) + weights = weights.astype(rc.float_dtype) + # loihi_weights has shape (in, out), to match the shape by block.Synapses loihi_weights = weights.T @@ -540,7 +545,7 @@ def build_full_chip_connection(model, conn): # noqa: C901 # use the same scaling as the ensemble does, to get good # decodes. Note that this assumes that the decoded value # is in the range -radius to radius, which is usually true. 
- gain = 1.0 / conn.pre_obj.radius + gain = np.array(1.0 / conn.pre_obj.radius, dtype=rc.float_dtype) decoder_block = LoihiBlock(2 * d, label="%s" % conn) decoder_block.compartment.configure_nonspiking( @@ -563,7 +568,8 @@ def build_full_chip_connection(model, conn): # noqa: C901 # use spiking decode neurons for on-chip connection if isinstance(conn.post_obj, Ensemble): # loihi encoders don't include radius, so handle scaling here - loihi_weights = scale_matrix(loihi_weights, 1.0 / conn.post_obj.radius) + gain = np.array(1.0 / conn.post_obj.radius, dtype=rc.float_dtype) + loihi_weights = scale_matrix(loihi_weights, gain) post_d = conn.post_obj.size_in post_inds = np.arange(post_d, dtype=np.int32)[post_slice] @@ -583,7 +589,7 @@ def build_full_chip_connection(model, conn): # noqa: C901 decoder_block.compartment.configure_filter(tau_s, dt=model.dt) post_tau = model.decode_tau - target_axons = -np.ones(pre_obj.n_neurons, dtype=int) + target_axons = -np.ones(pre_obj.n_neurons, dtype=np.int32) target_axons[pre_slice] = np.arange(target_axons[pre_slice].size) pre_slice = slice(None) @@ -662,7 +668,7 @@ def build_full_chip_connection(model, conn): # noqa: C901 post_obj.add_synapse(syn) model.objs[conn]["weights"] = syn - target_axons = -np.ones(mid_obj.n_neurons, dtype=int) + target_axons = -np.ones(mid_obj.n_neurons, dtype=np.int32) target_axons[pre_slice] = np.arange(target_axons[pre_slice].size) assert target_axons[pre_slice].size == n1 @@ -684,7 +690,8 @@ def build_full_chip_connection(model, conn): # noqa: C901 assert post_obj.n_neurons == n2 # loihi encoders don't include radius, so handle scaling here - loihi_weights = scale_matrix(loihi_weights, 1.0 / conn.post_obj.radius) + scale = np.array(1.0 / conn.post_obj.radius, dtype=rc.float_dtype) + loihi_weights = scale_matrix(loihi_weights, scale) syn = Synapse(n1, label="%s::decoder_weights" % conn) syn.set_weights(loihi_weights) @@ -783,6 +790,7 @@ def build_conv2d_connection(model, transform, conn): obj=conn.post_obj.ensemble, ) kernel = kernel * gain[0] + kernel = kernel.astype(rc.float_dtype) pop_type = model.config[conn].pop_type new_transform = copy.copy(transform) @@ -802,9 +810,9 @@ def build_conv2d_connection(model, transform, conn): "is therefore emulator-only." 
) - target_axons = -np.ones(pre_obj.n_neurons, dtype=int) + target_axons = -np.ones(pre_obj.n_neurons, dtype=np.int32) target_axons[conn.pre_slice] = pixel_idxs(input_shape) - atoms = np.zeros(pre_obj.n_neurons, dtype=int) + atoms = np.zeros(pre_obj.n_neurons, dtype=np.int32) atoms[conn.pre_slice] = channel_idxs(input_shape) ax = Axon(np.prod(input_shape.spatial_shape), label="conv2d_weights") diff --git a/nengo_loihi/builder/ensemble.py b/nengo_loihi/builder/ensemble.py index fc1cfd2c5..0393a452f 100644 --- a/nengo_loihi/builder/ensemble.py +++ b/nengo_loihi/builder/ensemble.py @@ -7,14 +7,16 @@ from nengo.builder.ensemble import BuiltEnsemble, gen_eval_points from nengo.dists import Distribution, get_samples from nengo.exceptions import BuildError +from nengo.rc import rc from nengo_loihi.block import LoihiBlock from nengo_loihi.builder.builder import Builder -def get_gain_bias(ens, rng=np.random, intercept_limit=1.0): +def get_gain_bias(ens, rng=np.random, intercept_limit=1.0, dtype=None): # Modified from the Nengo version to handle `intercept_limit` + dtype = rc.float_dtype if dtype is None else dtype if ens.gain is not None and ens.bias is not None: gain = get_samples(ens.gain, ens.n_neurons, rng=rng) bias = get_samples(ens.bias, ens.n_neurons, rng=rng) @@ -60,6 +62,11 @@ def get_gain_bias(ens, rng=np.random, intercept_limit=1.0): "by reducing the maximum intercept value to below 1." ) + dtype = rc.float_dtype + gain = gain.astype(dtype) if gain is not None else gain + bias = bias.astype(dtype) if bias is not None else bias + max_rates = max_rates.astype(dtype) if max_rates is not None else max_rates + intercepts = intercepts.astype(dtype) if intercepts is not None else intercepts return gain, bias, max_rates, intercepts @@ -71,13 +78,14 @@ def build_ensemble(model, ens): # Create random number generator rng = np.random.RandomState(model.seeds[ens]) - eval_points = gen_eval_points(ens, ens.eval_points, rng=rng) + eval_points = gen_eval_points(ens, ens.eval_points, rng=rng, dtype=rc.float_dtype) # Set up encoders if isinstance(ens.encoders, Distribution): encoders = get_samples(ens.encoders, ens.n_neurons, ens.dimensions, rng=rng) + encoders = np.asarray(encoders, dtype=rc.float_dtype) else: - encoders = npext.array(ens.encoders, min_dims=2, dtype=np.float64) + encoders = npext.array(ens.encoders, min_dims=2, dtype=rc.float_dtype) if ens.normalize_encoders: encoders /= npext.norm(encoders, axis=1, keepdims=True) @@ -90,7 +98,9 @@ def build_ensemble(model, ens): ) # Build the neurons - gain, bias, max_rates, intercepts = get_gain_bias(ens, rng, model.intercept_limit) + gain, bias, max_rates, intercepts = get_gain_bias( + ens, rng, intercept_limit=model.intercept_limit, dtype=rc.float_dtype + ) block = LoihiBlock(ens.n_neurons, label="%s" % ens) block.compartment.bias[:] = bias diff --git a/nengo_loihi/conv.py b/nengo_loihi/conv.py index 377019ada..f7e6688f6 100644 --- a/nengo_loihi/conv.py +++ b/nengo_loihi/conv.py @@ -155,8 +155,8 @@ def conv2d_loihi_weights(transform): weights = [] indices = [] # compartment offset (aka. 
compartment base) for each axon - offsets = np.zeros(input_rows * input_cols, dtype=int) - axon_to_weight_map = np.zeros(input_rows * input_cols, dtype=int) + offsets = np.zeros(input_rows * input_cols, dtype=np.int32) + axon_to_weight_map = np.zeros(input_rows * input_cols, dtype=np.int32) weights_map = {} for i, j in itertools.product(range(input_rows), range(input_cols)): ij = i * input_cols + j @@ -203,10 +203,10 @@ def conv2d_loihi_weights(transform): # --- determine indices # channel inds are zero, since we use same indices for each channel - channel_inds = np.zeros(n_channels, dtype=int) - row_inds = np.arange(wmask_i.sum()) - col_inds = np.arange(wmask_j.sum()) - filter_inds = np.arange(n_filters) + channel_inds = np.zeros(n_channels, dtype=np.int32) + row_inds = np.arange(wmask_i.sum(), dtype=np.int32) + col_inds = np.arange(wmask_j.sum(), dtype=np.int32) + filter_inds = np.arange(n_filters, dtype=np.int32) order = [channel_inds, row_inds, col_inds, filter_inds] shape = [n_channels, output_rows, output_cols, n_filters] @@ -217,7 +217,7 @@ def conv2d_loihi_weights(transform): shape = [shape[i] for i in (0, 3, 1, 2)] n = len(shape) - strides = [np.prod(shape[i + 1 :]) for i in range(n)] + strides = [np.prod(shape[i + 1 :], dtype=np.int32) for i in range(n)] # inds[i_0,...,i_{n-1}] = sum_{k=0}^{n-1} strides[k] * order[k][i_k] strided_inds = [ From c1d7e1066006e72edf224a55d1e97e8503154826 Mon Sep 17 00:00:00 2001 From: Eric Hunsberger Date: Mon, 2 Nov 2020 16:00:33 -0500 Subject: [PATCH 06/11] Save discretize info on compartment for reference --- CHANGES.rst | 2 ++ nengo_loihi/block.py | 2 ++ nengo_loihi/builder/discretize.py | 14 ++++++++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 923443bdb..7624a4bda 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -32,6 +32,8 @@ Release history - Added ``Simulator.clear_probes`` to clear probe histories. This can help reduce memory usage during long runs, by running for a segment of the full run time, recording the relevant outputs, calling ``clear_probes``, and resuming the run. (`#303`_) +- ``Block`` now has a ``.discretize_info`` attribute that stores parameters used + for discretizing that block. 
(`#309`_) **Changed** diff --git a/nengo_loihi/block.py b/nengo_loihi/block.py index ddbee993b..18caf72ce 100644 --- a/nengo_loihi/block.py +++ b/nengo_loihi/block.py @@ -178,6 +178,8 @@ def __init__(self, n_compartments, label=None): self.noise_exp = 0 self.noise_at_membrane = 0 + self.discretize_info = None + def __str__(self): return "%s(%s)" % (type(self).__name__, self.label if self.label else "") diff --git a/nengo_loihi/builder/discretize.py b/nengo_loihi/builder/discretize.py index 7e1158560..462e0a617 100644 --- a/nengo_loihi/builder/discretize.py +++ b/nengo_loihi/builder/discretize.py @@ -242,10 +242,11 @@ def discretize_block(block): w_maxs = [s.max_abs_weight() for s in block.synapses] w_max = max(w_maxs) if len(w_maxs) > 0 else 0 - p = discretize_compartment(block.compartment, w_max) + info = discretize_compartment(block.compartment, w_max) for synapse in block.synapses: - discretize_synapse(synapse, w_max, p["w_scale"], p["w_exp"]) - return p["v_scale"] + discretize_synapse(synapse, w_max, info["w_scale"], info["w_exp"]) + + return info["v_scale"] def discretize_compartment(comp, w_max): @@ -363,7 +364,12 @@ def discretize_compartment(comp, w_max): vmaxe = np.clip(np.round((np.log2(vmax + 1) - 9) * 0.5), 0, 2 ** 3 - 1) comp.vmax = 2 ** (9 + 2 * vmaxe) - 1 - return dict(w_max=w_max, w_scale=w_scale, w_exp=w_exp, v_scale=v_scale) + info = dict( + w_max=w_max, w_exp=w_exp, v_scale=v_scale, b_scale=b_scale, w_scale=w_scale + ) + comp.discretize_info = info + + return info def discretize_synapse(synapse, w_max, w_scale, w_exp): From 9abc52d13c237b0d997943f0581f76bc906e671f Mon Sep 17 00:00:00 2001 From: Eric Hunsberger Date: Tue, 24 Nov 2020 16:34:50 -0500 Subject: [PATCH 07/11] Add connection_decode_neurons To map connections to DecodeNeurons so that users can see which connections have DecodeNeurons and get a handle to the relevant Ensemble or LoihiBlock if necessary. --- CHANGES.rst | 3 +++ nengo_loihi/builder/builder.py | 7 +++++-- nengo_loihi/builder/connection.py | 11 +++++----- nengo_loihi/builder/tests/test_builder.py | 25 +++++++++++++++++++++++ 4 files changed, 38 insertions(+), 8 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7624a4bda..a15565c33 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -34,6 +34,9 @@ Release history relevant outputs, calling ``clear_probes``, and resuming the run. (`#303`_) - ``Block`` now has a ``.discretize_info`` attribute that stores parameters used for discretizing that block. (`#309`_) +- ``Model`` now has a ``connection_decode_neurons`` attribute that maps ``Connection`` + objects that require decode neurons to the corresponding ``Ensemble`` objects + implementing them. (`#309`_) **Changed** diff --git a/nengo_loihi/builder/builder.py b/nengo_loihi/builder/builder.py index 334c97715..956c55541 100644 --- a/nengo_loihi/builder/builder.py +++ b/nengo_loihi/builder/builder.py @@ -44,6 +44,9 @@ class Model: Attributes ---------- + connection_decode_neurons : dict of {Connection: Ensemble} + Map of each `nengo.Connection` that requires DecodeNeurons, to the + `nengo.Ensemble` that implements said DecodeNeurons. 
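+        For a connection implemented on chip, the value is instead the
+        `.LoihiBlock` realizing the decode neurons (see
+        ``test_connection_decode_neurons``).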
Build parameters @@ -127,12 +130,12 @@ def __init__(self, dt=0.001, label=None, builder=None): self.block_shapes = {} self.probes = [] - # Will be filled in by the simulator __init__ - self.split = None + self.connection_decode_neurons = {} # Will be filled in by the network builder self.toplevel = None self.config = None + self.split = None # Resources used by the build process self.objs = defaultdict(dict) # maps Nengo objects to Loihi objects diff --git a/nengo_loihi/builder/connection.py b/nengo_loihi/builder/connection.py index d185e18c5..dcc439441 100644 --- a/nengo_loihi/builder/connection.py +++ b/nengo_loihi/builder/connection.py @@ -208,6 +208,7 @@ def build_host_to_chip(model, conn): ens.label = None if conn.label is None else "%s_ens" % conn.label _inherit_seed(host, ens, model, conn) host.build(ens) + model.connection_decode_neurons[conn] = ens pre2ens = Connection( conn.pre, @@ -552,8 +553,6 @@ def build_full_chip_connection(model, conn): # noqa: C901 dt=model.dt, vth=model.vth_nonspiking ) decoder_block.compartment.bias[:] = 0 - model.add_block(decoder_block) - model.objs[conn]["decoded"] = decoder_block dec_syn = Synapse(n, label="probe_decoders") weights2 = stack_matrices( @@ -563,7 +562,6 @@ def build_full_chip_connection(model, conn): # noqa: C901 dec_syn.set_weights(weights2) decoder_block.add_synapse(dec_syn) - model.objs[conn]["decoders"] = dec_syn else: # use spiking decode neurons for on-chip connection if isinstance(conn.post_obj, Ensemble): @@ -581,9 +579,10 @@ def build_full_chip_connection(model, conn): # noqa: C901 loihi_weights, block_label="%s" % conn, syn_label="decoders" ) - model.add_block(decoder_block) - model.objs[conn]["decoded"] = decoder_block - model.objs[conn]["decoders"] = dec_syn + model.add_block(decoder_block) + model.objs[conn]["decoded"] = decoder_block + model.objs[conn]["decoders"] = dec_syn + model.connection_decode_neurons[conn] = decoder_block # use tau_s for filter into decode neurons, decode_tau for filter out decoder_block.compartment.configure_filter(tau_s, dt=model.dt) diff --git a/nengo_loihi/builder/tests/test_builder.py b/nengo_loihi/builder/tests/test_builder.py index 14cd2d9d2..84dbfcdd3 100644 --- a/nengo_loihi/builder/tests/test_builder.py +++ b/nengo_loihi/builder/tests/test_builder.py @@ -57,3 +57,28 @@ def test_probemap_bad_type_error(Simulator, monkeypatch): def test_builder_strings(): model = Model(label="myModel") assert str(model) == "Model(myModel)" + + +@pytest.mark.parametrize("a_on_chip", [True, False]) +def test_connection_decode_neurons(a_on_chip, Simulator): + with nengo.Network() as net: + nengo_loihi.add_params(net) + + u = nengo.Node([1], label="u") + a = nengo.Ensemble(100, 1, label="a") + net.config[a].on_chip = a_on_chip + b = nengo.Ensemble(100, 1, label="b") + probe1 = nengo.Probe(b) + nengo.Probe(b.neurons) + conn1 = nengo.Connection(u, a) + conn2 = nengo.Connection(a, b) + + with Simulator(net) as sim: + dic = sim.model.connection_decode_neurons + assert isinstance(dic.pop(conn1 if a_on_chip else conn2), nengo.Ensemble) + if a_on_chip: + assert isinstance(dic.pop(conn2), nengo_loihi.block.LoihiBlock) + + conn3, dec3 = dic.popitem() + assert conn3.pre == b and conn3.post == probe1 + assert len(dic) == 0 From 8757458d3c50985739c2949aa38b5e745b18fae6 Mon Sep 17 00:00:00 2001 From: Eric Hunsberger Date: Mon, 9 Nov 2020 15:22:55 -0500 Subject: [PATCH 08/11] Add GreedyComms allocator To reduce inter-chip communication. 
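A rough usage sketch (``net`` is any Nengo network here, and the chip count
and run time are illustrative; the allocator is passed through the simulator's
``hardware_options``, and ``ensemble_rates`` is optional):

    import nengo_loihi
    from nengo_loihi.hardware.allocators import GreedyComms

    with nengo_loihi.Simulator(
        net,
        target="loihi",
        hardware_options={"n_chips": 2, "allocator": GreedyComms()},
    ) as sim:
        sim.run(1.0)

If estimated firing rates are available for each ensemble, passing them as
``GreedyComms(ensemble_rates=rates)`` weights the inter-chip traffic by
expected spike counts rather than by axon counts alone.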
--- CHANGES.rst | 2 + nengo_loihi/builder/builder.py | 1 + nengo_loihi/builder/network.py | 7 +- nengo_loihi/hardware/allocators.py | 201 ++++++++++++++++++ nengo_loihi/hardware/tests/test_allocators.py | 55 ++++- 5 files changed, 264 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index a15565c33..4b07dceb6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -37,6 +37,8 @@ Release history - ``Model`` now has a ``connection_decode_neurons`` attribute that maps ``Connection`` objects that require decode neurons to the corresponding ``Ensemble`` objects implementing them. (`#309`_) +- Added the ``GreedyComms`` allocator, which reduces inter-chip communication, speeding + up networks with high traffic between chips. (`#309`_) **Changed** diff --git a/nengo_loihi/builder/builder.py b/nengo_loihi/builder/builder.py index 956c55541..09a2b8234 100644 --- a/nengo_loihi/builder/builder.py +++ b/nengo_loihi/builder/builder.py @@ -129,6 +129,7 @@ def __init__(self, dt=0.001, label=None, builder=None): self.blocks = OrderedDict() self.block_shapes = {} self.probes = [] + self.block_comp_map = {} self.connection_decode_neurons = {} diff --git a/nengo_loihi/builder/network.py b/nengo_loihi/builder/network.py index 145ce872c..5ed946ee8 100644 --- a/nengo_loihi/builder/network.py +++ b/nengo_loihi/builder/network.py @@ -40,7 +40,12 @@ def build_network( model.build(conn) # Split blocks into blocks that will fit on cores - split_model(model) + block_map = split_model(model) + model.block_comp_map = { + new_block: comp_idxs + for old_block, new_blocks in block_map.items() + for new_block, comp_idxs in new_blocks.items() + } if discretize: discretize_model(model) diff --git a/nengo_loihi/hardware/allocators.py b/nengo_loihi/hardware/allocators.py index 99697a05d..a2bc2448e 100644 --- a/nengo_loihi/hardware/allocators.py +++ b/nengo_loihi/hardware/allocators.py @@ -214,3 +214,204 @@ def get_chip(i): logger.info("Round-robin allocation across %d chips", board.n_chips) return board + + +def ens_to_block_rates(model, ens_rates): + block_rates = {} + for ens, rates in ens_rates.items(): + if ens not in model.objs: + if ens in model.host_pre.sig or ens in model.host.sig: + continue # this ensemble is not on chip, so skip it + + raise ValueError(f"Ensemble {ens} does not appear in the model") + + assert len(rates) == ens.n_neurons + blocks = model.objs[ens]["out"] + blocks = blocks if isinstance(blocks, (list, tuple)) else [blocks] + + for block in blocks: + comp_idxs = model.block_comp_map.get(block, None) + if comp_idxs is None: + assert len(blocks) == 1 + assert block.compartment.n_compartments == ens.n_neurons + block_rates[block] = rates + else: + block_rates[block] = rates[comp_idxs] + + return block_rates + + +def compute_block_conns(block_map, block_rates=None, conns_in=False): # noqa: C901 + # --- store number of axons from block i to block j + block_conns = {k: {} for k in block_map} + if conns_in: + block_conns_in = {k: {} for k in block_map} + + synapse_block_map = {} + for i, block_i in block_map.items(): + for synapse in block_i.synapses: + assert id(synapse) not in synapse_block_map + synapse_block_map[id(synapse)] = i + + for i, block_i in block_map.items(): + for axon in block_i.axons: + j = synapse_block_map[id(axon.target)] + + if i == j: + # don't care about self connections + continue + + # use non-zero value as default, so that even if all rates are zero, this + # still gets recognized as a connection from i to j + block_conns[i].setdefault(j, 1e-16) + if conns_in: + 
block_conns_in[j].setdefault(i, 1e-16) + + if block_rates is None: + val = axon.n_axons + elif block_i not in block_rates: + raise KeyError(f"block {block_i} not in block_rates") + else: + rates = block_rates[block_i] + comp_idxs = np.arange(block_i.compartment.n_compartments) + axon_ids = axon.map_axon(comp_idxs) + assert axon_ids.size == rates.size + val = rates[axon_ids >= 0].sum() + + block_conns[i][j] += val + if conns_in: + block_conns_in[j][i] += val + + return (block_conns, block_conns_in) if conns_in else block_conns + + +def measure_interchip_conns(board, block_rates=None): + i = 0 + block_map = {} + block_chip = {} + for chip in board.chips: + chip_idx = board.chip_idxs[chip] + for core in chip.cores: + # core_idx = chip.core_idxs[core] + for block in core.blocks: + block_map[i] = block + block_chip[i] = chip_idx + i += 1 + + block_conns = compute_block_conns(block_map, block_rates=block_rates) + + stats = {"interchip": 0, "intrachip": 0} + stats["interchip_pairs"] = [] + stats["intrachip_pairs"] = [] + for i, block in block_map.items(): + chip_idx_i = block_chip[i] + for j, weight in block_conns[i].items(): + if i == j: + continue + + chip_idx_j = block_chip[j] + key = "intrachip" if chip_idx_i == chip_idx_j else "interchip" + stats[key] += weight + stats[f"{key}_pairs"].append((block_map[i], block_map[j])) + + return stats + + +class GreedyComms(Greedy): + """Assigns each block to a core, using as few chips as possible, minimizing comms. + + A variant of the `.Greedy` allocator that also minimizes inter-chip communication. + + Starts by arbitrarily assigning a block to a chip. Then adds the block that has the + most communication with the first block to that same chip. Continue adding blocks + with the most communication to already placed blocks, until the chip is full. Then + start a new chip using the block with the least communication. 
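+
+    Parameters
+    ----------
+    cores_per_chip : int
+        Maximum number of cores to use on each chip (at most 128 on current
+        hardware).
+    ensemble_rates : dict of {nengo.Ensemble: ndarray}, optional
+        Estimated firing rates of the neurons in each ensemble. If provided,
+        inter-chip traffic is weighted by expected spike counts rather than
+        by axon counts alone.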
+ """ + + def __init__(self, cores_per_chip=128, ensemble_rates=None): + super().__init__(cores_per_chip=cores_per_chip) + self.ensemble_rates = ensemble_rates + + def __call__(self, model, n_chips): # noqa: C901 + block_map = dict(enumerate(model.blocks)) + block_rates = ( + ens_to_block_rates(model, self.ensemble_rates) + if self.ensemble_rates is not None + else None + ) + block_conns_out, block_conns_in = compute_block_conns( + block_map, block_rates=block_rates, conns_in=True + ) + + # find blocks with no pre block + no_pre_blocks = [] + for i in block_map: + if sum(v for v in block_conns_in[i].values()) == 0: + no_pre_blocks.append(i) + + # --- create board + board = Board() + + # add inputs to board + for input in model.inputs: + self.input_to_board(input, board) + + # --- add blocks to chips + chip = None + unallocated_blocks = set(block_map) + + while len(unallocated_blocks) > 0: + if chip is None or len(chip.cores) == self.cores_per_chip: + assert ( + len(board.chips) < n_chips + ), f"The network needs more chips than requested ({n_chips})" + + # start a new chip + chip = board.new_chip() + + # choose a no-pre block, if possible + for block_idx in no_pre_blocks: + if block_idx in unallocated_blocks: + break + else: + block_idx = next(iter(unallocated_blocks)) + + chip_blocks = set() + else: + # choose the block with the largest connection to blocks on this chip + block_idx = -1 + max_conn = 0 + for i in chip_blocks: + for j in unallocated_blocks.intersection(block_conns_out[i]): + ij = block_conns_out[i][j] + if ij > max_conn: + max_conn = ij + block_idx = j + + for j in unallocated_blocks.intersection(block_conns_in[i]): + ij = block_conns_in[i][j] + if ij > max_conn: + max_conn = ij + block_idx = j + + if block_idx < 0: + # none of the remaining blocks connect to blocks on this chip, + # so pick a no-pre block if possible, otherwise any block will do. 
+ for block_idx in no_pre_blocks: + if block_idx in unallocated_blocks: + break + else: + block_idx = next(iter(unallocated_blocks)) + + block = block_map[block_idx] + self.block_to_new_core(block, chip) + + chip_blocks.add(block_idx) + unallocated_blocks.remove(block_idx) + + # add probes + board.probes.extend(model.probes) + + logger.info("GreedyComms allocation across %d chips", board.n_chips) + + return board diff --git a/nengo_loihi/hardware/tests/test_allocators.py b/nengo_loihi/hardware/tests/test_allocators.py index 27e4f9f57..94655aa81 100644 --- a/nengo_loihi/hardware/tests/test_allocators.py +++ b/nengo_loihi/hardware/tests/test_allocators.py @@ -7,7 +7,14 @@ from nengo_loihi.block import Axon, LoihiBlock, Synapse from nengo_loihi.builder import Model from nengo_loihi.builder.discretize import discretize_model -from nengo_loihi.hardware.allocators import Greedy, RoundRobin, core_stdp_pre_cfgs +from nengo_loihi.hardware.allocators import ( + Greedy, + GreedyComms, + RoundRobin, + core_stdp_pre_cfgs, + ens_to_block_rates, + measure_interchip_conns, +) from nengo_loihi.hardware.nxsdk_objects import Board from nengo_loihi.inputs import LoihiInput @@ -163,6 +170,52 @@ def test_greedy_chip_allocator_cfg_check(): Greedy(cores_per_chip=130)(model, n_chips=4) +@pytest.mark.parametrize("Allocator", [GreedyComms]) +def test_comms_allocators(Allocator, Simulator): + rng = np.random.RandomState(1) # same seed for all allocators, to compare + with nengo.Network(seed=0) as net: + n_ensembles = 256 + n_neurons = rng.randint(64, 256, size=n_ensembles) + ensembles = [nengo.Ensemble(n, dimensions=1) for n in n_neurons] + + conn_pairs = rng.randint(0, n_ensembles, size=(2 * n_ensembles, 2)) + for i, j in conn_pairs: + ei, ej = ensembles[i].neurons, ensembles[j].neurons + nengo.Connection( + ei, + ej, + transform=rng.uniform(-0.1, 0.1, size=(ej.size_in, ei.size_out)), + ) + + ens_rates = { + ensemble: rng.uniform(1, 100, size=1) + * rng.uniform(0.9, 1, size=ensemble.n_neurons) + for ensemble in ensembles + } + + with Simulator(net, target="sim") as sim: + model = sim.model + n_chips = 3 + block_rates = ens_to_block_rates(model, ens_rates) + board_norates = Allocator()(model, n_chips=n_chips) + board_rates = Allocator(ensemble_rates=ens_rates)(model, n_chips=n_chips) + + norates_axons = measure_interchip_conns(board_norates) + norates_spikes = measure_interchip_conns(board_norates, block_rates=block_rates) + rates_axons = measure_interchip_conns(board_rates) + rates_spikes = measure_interchip_conns(board_rates, block_rates=block_rates) + + print( + f"No rates: {norates_axons['interchip']} axons, " + f"{norates_spikes['interchip']} spikes" + ) + print( + f"Rates: {rates_axons['interchip']} axons, {rates_spikes['interchip']} spikes" + ) + assert norates_axons["interchip"] < rates_axons["interchip"] + assert rates_spikes["interchip"] < norates_spikes["interchip"] + + @pytest.mark.slow @pytest.mark.target_loihi def test_deterministic_network_allocation(Simulator, seed): From 7cf1281ebc3918d252da00865632932d656bd607 Mon Sep 17 00:00:00 2001 From: Eric Hunsberger Date: Tue, 17 Nov 2020 14:01:37 -0500 Subject: [PATCH 09/11] Add PartitionComms allocator --- CHANGES.rst | 6 +- nengo_loihi/hardware/allocators.py | 85 +++++++++++++++++++ nengo_loihi/hardware/tests/test_allocators.py | 6 +- 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 4b07dceb6..aeea53076 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -37,8 +37,10 @@ Release history - ``Model`` now has a 
``connection_decode_neurons`` attribute that maps ``Connection`` objects that require decode neurons to the corresponding ``Ensemble`` objects implementing them. (`#309`_) -- Added the ``GreedyComms`` allocator, which reduces inter-chip communication, speeding - up networks with high traffic between chips. (`#309`_) +- Added the ``GreedyComms`` and ``PartitionComms`` allocators, which reduce inter-chip + communication, speeding up networks with high traffic between chips. + ``PartitionComms`` typically finds a more optimal partitioning than ``GreedyComms``, + but does require the ``nxmetis`` package. (`#309`_) **Changed** diff --git a/nengo_loihi/hardware/allocators.py b/nengo_loihi/hardware/allocators.py index a2bc2448e..0c8c2665f 100644 --- a/nengo_loihi/hardware/allocators.py +++ b/nengo_loihi/hardware/allocators.py @@ -415,3 +415,88 @@ def __call__(self, model, n_chips): # noqa: C901 logger.info("GreedyComms allocation across %d chips", board.n_chips) return board + + +class PartitionComms(Allocator): + """Uses METIS partitioner to spread blocks across all chips, minimizing comms. + + Spreads blocks equally across cores and minimizes inter-chip communication. + + Requires `nxmetis `. + """ + + # TODO: + # - Potentially allow more blocks on one chip (i.e. unbalanced partitioning), + # if it will improve communication. Unclear if nxmetis supports this. + # - Check that partitioning is always balanced, and that no chips + # will have too many cores. Initial tests show that it is always balanced. + + def __init__(self, ensemble_rates=None, rate_scale=1): + import networkx # pylint: disable=import-outside-toplevel + import nxmetis # pylint: disable=import-outside-toplevel + + super().__init__() + self.ensemble_rates = ensemble_rates + self.rate_scale = rate_scale + + self.networkx = networkx + self.nxmetis = nxmetis + + def __call__(self, model, n_chips): + block_map = dict(enumerate(model.blocks)) + + block_rates = None + if self.ensemble_rates is not None: + block_rates = ens_to_block_rates(model, self.ensemble_rates) + block_rates = { + block: np.round(rate * self.rate_scale) + for block, rate in block_rates.items() + } + + block_conns = compute_block_conns(block_map, block_rates=block_rates) + + # partition graph + G = self.networkx.Graph() + G.add_nodes_from(block_map.keys()) + + edge_map = set() + for i in block_map: + for j, val in block_conns[i].items(): + if (i, j) in edge_map or (j, i) in edge_map: + continue + + val = val + block_conns[j].get(i, 0) + # G.add_edge(i, j, weight=float(val)) + G.add_edge(i, j, weight=int(round(val))) # weights must be integers + edge_map.add((i, j)) + edge_map.add((j, i)) + + objval, parts = self.nxmetis.partition(G, nparts=int(n_chips)) + + for i, part in enumerate(parts): + if len(part) > 128: + raise ValueError( + f"Partition {i} has {len(part)} cores, " + "which exceeds the available 128 cores" + ) + + # --- create board + board = Board() + + # add inputs to board + for input in model.inputs: + self.input_to_board(input, board) + + # blocks to chips + for part in parts: + chip = board.new_chip() + for block_idx in part: + block = block_map[block_idx] + self.block_to_new_core(block, chip) + + # add probes + board.probes.extend(model.probes) + + logger.info("METIS allocation across %d chips", board.n_chips) + + return board diff --git a/nengo_loihi/hardware/tests/test_allocators.py b/nengo_loihi/hardware/tests/test_allocators.py index 94655aa81..1304320f3 100644 --- a/nengo_loihi/hardware/tests/test_allocators.py +++ 
b/nengo_loihi/hardware/tests/test_allocators.py @@ -10,6 +10,7 @@ from nengo_loihi.hardware.allocators import ( Greedy, GreedyComms, + PartitionComms, RoundRobin, core_stdp_pre_cfgs, ens_to_block_rates, @@ -170,8 +171,11 @@ def test_greedy_chip_allocator_cfg_check(): Greedy(cores_per_chip=130)(model, n_chips=4) -@pytest.mark.parametrize("Allocator", [GreedyComms]) +@pytest.mark.parametrize("Allocator", [GreedyComms, PartitionComms]) def test_comms_allocators(Allocator, Simulator): + if Allocator is PartitionComms: + pytest.importorskip("nxmetis") + rng = np.random.RandomState(1) # same seed for all allocators, to compare with nengo.Network(seed=0) as net: n_ensembles = 256 From 7732a14cd5d3f1891dfe33361d5d799f8d2ef729 Mon Sep 17 00:00:00 2001 From: Eric Hunsberger Date: Wed, 1 Dec 2021 12:21:13 -0500 Subject: [PATCH 10/11] Fix NxSDK import To ensure that HAS_NXSDK is never false if nxsdk is installed. --- nengo_loihi/hardware/nxsdk_shim.py | 50 ++++++++++++++---------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/nengo_loihi/hardware/nxsdk_shim.py b/nengo_loihi/hardware/nxsdk_shim.py index 193f9c550..02ccf39e1 100644 --- a/nengo_loihi/hardware/nxsdk_shim.py +++ b/nengo_loihi/hardware/nxsdk_shim.py @@ -29,8 +29,6 @@ def parse_nxsdk_version(nxsdk): nxsdk_dir = os.path.realpath(os.path.join(os.path.dirname(nxsdk.__file__), "..")) nxsdk_version = parse_nxsdk_version(nxsdk) - import nxsdk.graph.graph as snip_maker - def assert_nxsdk(): pass @@ -40,7 +38,6 @@ def assert_nxsdk(): nxsdk = None nxsdk_dir = None nxsdk_version = None - snip_maker = None exception = sys.exc_info()[1] @@ -49,6 +46,29 @@ def assert_nxsdk(exception=exception): if HAS_NXSDK: # noqa: C901 + import nxsdk.compiler.microcodegen.interface as micro_gen + import nxsdk.graph.graph as snip_maker + from nxsdk.graph.nxinputgen.nxinputgen import BasicSpikeGenerator as SpikeGen + from nxsdk.graph.nxprobes import N2SpikeProbe as SpikeProbe + from nxsdk.graph.processes.phase_enums import Phase as SnipPhase + + try: + # try new location (nxsdk > 0.9.0) + from nxsdk.arch.n2a.compiler.tracecfggen.tracecfggen import ( + TraceCfgGen as TraceConfigGenerator, + ) + except ImportError: # pragma: no cover + # try old location (nxsdk <= 0.9.0) + from nxsdk.compiler.tracecfggen.tracecfggen import ( + TraceCfgGen as TraceConfigGenerator, + ) + + try: + # try new location (nxsdk >= 1.0.0) + from nxsdk.arch.n2a.n2board import N2Board as NxsdkBoard + except ImportError: # pragma: no cover + # try old location (nxsdk < 1.0.0) + from nxsdk.graph.nxboard import N2Board as NxsdkBoard class SnipMaker(snip_maker.Graph): """Patch of the snip process manager that is multiprocess safe.""" @@ -115,30 +135,6 @@ def createSnip(self, phase, *args, **kwargs): return super().createSnip(phase, *args, **kwargs) snip_maker.Graph = SnipMaker - - import nxsdk.compiler.microcodegen.interface as micro_gen - - try: - # try new location (nxsdk > 0.9.0) - from nxsdk.arch.n2a.compiler.tracecfggen.tracecfggen import ( - TraceCfgGen as TraceConfigGenerator, - ) - except ImportError: # pragma: no cover - # try old location (nxsdk <= 0.9.0) - from nxsdk.compiler.tracecfggen.tracecfggen import ( - TraceCfgGen as TraceConfigGenerator, - ) - - try: - # try new location (nxsdk >= 1.0.0) - from nxsdk.arch.n2a.n2board import N2Board as NxsdkBoard - except ImportError: # pragma: no cover - # try old location (nxsdk < 1.0.0) - from nxsdk.graph.nxboard import N2Board as NxsdkBoard - - from nxsdk.graph.nxinputgen.nxinputgen import BasicSpikeGenerator as SpikeGen - 
from nxsdk.graph.nxprobes import N2SpikeProbe as SpikeProbe
-    from nxsdk.graph.processes.phase_enums import Phase as SnipPhase
 else:
     SnipMaker = None
     micro_gen = None

From 4c42c65d81d5a0abfc10c5a1f3520bda22bd0bc8 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger
Date: Tue, 10 Nov 2020 13:49:06 -0500
Subject: [PATCH 11/11] Add LoihiRectifiedLinear

---
 CHANGES.rst                       |   3 +
 nengo_loihi/builder/nengo_dl.py   |  34 ++++++++--
 nengo_loihi/neurons.py            |  13 ++++
 nengo_loihi/tests/test_neurons.py | 101 ++++++++++++++----------------
 4 files changed, 91 insertions(+), 60 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index aeea53076..a51c1a1f5 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -41,6 +41,9 @@ Release history
   communication, speeding up networks with high traffic between chips.
   ``PartitionComms`` typically finds a more optimal partitioning than ``GreedyComms``,
   but does require the ``nxmetis`` package. (`#309`_)
+- Added the ``LoihiRectifiedLinear`` neuron type to train deep networks for Loihi using
+  Nengo or NengoDL. It is a rate neuron type and thus must ultimately be swapped for
+  ``LoihiSpikingRectifiedLinear`` to run on Loihi. (`#309`_)
 
 **Changed**
 
diff --git a/nengo_loihi/builder/nengo_dl.py b/nengo_loihi/builder/nengo_dl.py
index 978fff2b8..3b1051d78 100644
--- a/nengo_loihi/builder/nengo_dl.py
+++ b/nengo_loihi/builder/nengo_dl.py
@@ -7,6 +7,7 @@
 from nengo_loihi.neurons import (
     AlphaRCNoise,
     LoihiLIF,
+    LoihiRectifiedLinear,
     LoihiSpikingRectifiedLinear,
     LowpassRCNoise,
     discretize_tau_rc,
@@ -16,7 +17,7 @@
 if HAS_DL:
     import nengo_dl
     import tensorflow as tf
-    from nengo_dl.neuron_builders import LIFBuilder, SpikingRectifiedLinearBuilder
+    from nengo_dl.neuron_builders import LIFBuilder, TFNeuronBuilder
 else:  # pragma: no cover
     # Empty classes so that we can define the subclasses even though
     # we will never use them, as they are only used in the `install`
@@ -24,7 +25,7 @@
     class LIFBuilder:
         pass
 
-    class SpikingRectifiedLinearBuilder:
+    class TFNeuronBuilder:
         pass
@@ -260,8 +261,8 @@ def step(self, J, dt, voltage, refractory_time):
         )
 
 
-class LoihiSpikingRectifiedLinearBuilder(SpikingRectifiedLinearBuilder):
-    """nengo_dl builder for the LoihiSpikingRectifiedLinear neuron type."""
+class LoihiRectifiedLinearBuilder(TFNeuronBuilder):
+    """nengo_dl builder for the LoihiRectifiedLinear neuron type."""
 
     def build_pre(self, signals, config):
         super().build_pre(signals, config)
@@ -276,7 +277,7 @@
         self.zero = signals.zero
         self.one = signals.one
 
-    def training_step(self, J, dt, **state):
+    def step(self, J, dt):
         # Since LoihiLIF takes `ceil(period/dt)` the firing rate is
         # always below the LIF rate. Using `tau_ref1` in LIF curve makes
         # it the average of the LoihiLIF curve (rather than upper bound). 
@@ -297,6 +298,23 @@ def training_step(self, J, dt, **state): # loihi_rates on forward pass, rates on backwards return rates + tf.stop_gradient(loihi_rates - rates) + +class LoihiSpikingRectifiedLinearBuilder(LoihiRectifiedLinearBuilder): + """nengo_dl builder for the LoihiSpikingRectifiedLinear neuron type.""" + + def build_pre(self, signals, config): + super().build_pre(signals, config) + + self.zeros = tf.zeros( + (signals.minibatch_size,) + self.J_data.shape, signals.dtype + ) + + self.epsilon = tf.constant(1e-15, dtype=signals.dtype) + + # copy these so that they're easily accessible in _step functions + self.zero = signals.zero + self.one = signals.one + def step(self, J, dt, voltage): voltage += J * dt spiked = voltage > self.one @@ -308,6 +326,9 @@ def step(self, J, dt, voltage): # being used at all) return tf.stop_gradient(spikes), tf.stop_gradient(voltage) + def training_step(self, J, dt, **state): + return super().step(J, dt) + class Installer: def __init__(self): @@ -323,6 +344,9 @@ def __call__(self): nengo_dl.neuron_builders.SimNeuronsBuilder.TF_NEURON_IMPL[ LoihiLIF ] = LoihiLIFBuilder + nengo_dl.neuron_builders.SimNeuronsBuilder.TF_NEURON_IMPL[ + LoihiRectifiedLinear + ] = LoihiRectifiedLinearBuilder nengo_dl.neuron_builders.SimNeuronsBuilder.TF_NEURON_IMPL[ LoihiSpikingRectifiedLinear ] = LoihiSpikingRectifiedLinearBuilder diff --git a/nengo_loihi/neurons.py b/nengo_loihi/neurons.py index c3fa94d5c..c49ae6355 100644 --- a/nengo_loihi/neurons.py +++ b/nengo_loihi/neurons.py @@ -184,6 +184,19 @@ def step(self, dt, J, output, voltage, refractory_time): refractory_time[spikes_mask] = tau_ref + dt +class LoihiRectifiedLinear(RectifiedLinear): + def __init__(self, amplitude=1, **kwargs): + super().__init__(amplitude=amplitude, **kwargs) + _install_dl_builders() + + def rates(self, x, gain, bias, dt=0.001): + return loihi_spikingrectifiedlinear_rates(self, x, gain, bias, dt) + + def step(self, dt, J, output): + output[:] = 0 + output[J > 0] = (self.amplitude / dt) / np.ceil(np.reciprocal(dt * J[J > 0])) + + class LoihiSpikingRectifiedLinear(SpikingRectifiedLinear): """Simulate spiking rectified linear neurons as done by Loihi. 
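+
+    For training in Nengo or NengoDL, `.LoihiRectifiedLinear` provides a
+    rate-based approximation of this neuron type.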
diff --git a/nengo_loihi/tests/test_neurons.py b/nengo_loihi/tests/test_neurons.py index c9d02b0d2..a7ce60c3b 100644 --- a/nengo_loihi/tests/test_neurons.py +++ b/nengo_loihi/tests/test_neurons.py @@ -9,6 +9,7 @@ from nengo_loihi.neurons import ( AlphaRCNoise, LoihiLIF, + LoihiRectifiedLinear, LoihiSpikingRectifiedLinear, LowpassRCNoise, discretize_tau_rc, @@ -107,19 +108,35 @@ def test_loihi_rates_other_type(neuron_type, allclose): assert allclose(rates, ref_rates) -@pytest.mark.parametrize("neuron_type", [LoihiLIF(), LoihiSpikingRectifiedLinear()]) -def test_loihi_neurons(neuron_type, Simulator, plt, allclose): +@pytest.mark.parametrize( # noqa: C901 + "NeuronType", [LoihiLIF, LoihiRectifiedLinear, LoihiSpikingRectifiedLinear] +) +@pytest.mark.parametrize("inference_only", [True, False] if HAS_DL else [None]) +def test_loihi_neurons( + NeuronType, inference_only, Simulator, plt, allclose, monkeypatch +): + if HAS_DL: + # "uninstall" NengoDL builders to make sure each neuron type reinstalls them + monkeypatch.setattr(install_dl_builders, "installed", False) + + neuron_type = NeuronType() + if HAS_DL: + assert install_dl_builders.installed + dt = 0.0007 n = 256 encoders = np.ones((n, 1)) gain = np.zeros(n) - if isinstance(neuron_type, nengo.SpikingRectifiedLinear): + if isinstance(neuron_type, nengo.RectifiedLinear): bias = np.linspace(0, 1001, n) else: bias = np.linspace(0, 30, n) with nengo.Network() as model: + if HAS_DL: + nengo_dl.configure_settings(inference_only=inference_only) + ens = nengo.Ensemble( n, 1, neuron_type=neuron_type, encoders=encoders, gain=gain, bias=bias ) @@ -129,69 +146,43 @@ def test_loihi_neurons(neuron_type, Simulator, plt, allclose): with nengo.Simulator(model, dt=dt) as nengo_sim: nengo_sim.run(t_final) - with Simulator(model, dt=dt) as loihi_sim: - loihi_sim.run(t_final) - - rates_nengosim = np.sum(nengo_sim.data[probe] > 0, axis=0) / t_final - rates_loihisim = np.sum(loihi_sim.data[probe] > 0, axis=0) / t_final - - rates_ref = neuron_type.rates(0.0, gain, bias, dt=dt).squeeze() - plt.plot(bias, rates_loihisim, "r", label="loihi sim") - plt.plot(bias, rates_nengosim, "b-.", label="nengo sim") - plt.plot(bias, rates_ref, "k--", label="ref") - plt.legend(loc="best") + rates_nengosim = nengo_sim.data[probe].mean(axis=0) - assert rates_ref.shape == rates_nengosim.shape == rates_loihisim.shape - atol = 1.0 / t_final # the fundamental unit for our rates - assert allclose(rates_nengosim, rates_ref, atol=atol, rtol=0, xtol=1) - assert allclose(rates_loihisim, rates_ref, atol=atol, rtol=0, xtol=1) + rates_dlsim = None + if HAS_DL: + with nengo_dl.Simulator(model, dt=dt) as dl_sim: + dl_sim.run(t_final) + rates_dlsim = dl_sim.data[probe].mean(axis=0) -@pytest.mark.skipif(not HAS_DL, reason="requires nengo-dl") -@pytest.mark.parametrize("neuron_type", [LoihiLIF(), LoihiSpikingRectifiedLinear()]) -@pytest.mark.parametrize("inference_only", (True, False)) -def test_nengo_dl_neurons(neuron_type, inference_only, Simulator, plt, allclose): - install_dl_builders() + rates_loihisim = None + if type(neuron_type) in (LoihiLIF, LoihiSpikingRectifiedLinear): + with Simulator(model, dt=dt) as loihi_sim: + loihi_sim.run(t_final) - dt = 0.0007 - - n = 256 - encoders = np.ones((n, 1)) - gain = np.zeros(n) - if isinstance(neuron_type, nengo.SpikingRectifiedLinear): - bias = np.linspace(0, 1001, n) - else: - bias = np.linspace(0, 30, n) - - with nengo.Network() as model: - nengo_dl.configure_settings(inference_only=inference_only) - - a = nengo.Ensemble( - n, 1, 
neuron_type=neuron_type, encoders=encoders, gain=gain, bias=bias - ) - ap = nengo.Probe(a.neurons) - - t_final = 1.0 - with nengo_dl.Simulator(model, dt=dt) as dl_sim: - dl_sim.run(t_final) - - with Simulator(model, dt=dt) as loihi_sim: - loihi_sim.run(t_final) - - rates_dlsim = (dl_sim.data[ap] > 0).sum(axis=0) / t_final - rates_loihisim = (loihi_sim.data[ap] > 0).sum(axis=0) / t_final + rates_loihisim = loihi_sim.data[probe].mean(axis=0) zeros = np.zeros((1, gain.size)) rates_ref = neuron_type.rates(zeros, gain, bias, dt=dt).squeeze(axis=0) - plt.plot(bias, rates_loihisim, "r", label="loihi sim") - plt.plot(bias, rates_dlsim, "b-.", label="dl sim") + + # plot + if rates_loihisim is not None: + plt.plot(bias, rates_loihisim, "r", label="loihi sim") + if rates_dlsim is not None: + plt.plot(bias, rates_dlsim, "g-.", label="dl sim") + plt.plot(bias, rates_nengosim, "b:", label="nengo sim") plt.plot(bias, rates_ref, "k--", label="rates_ref") plt.legend(loc="best") atol = 1.0 / t_final # the fundamental unit for our rates - assert rates_ref.shape == rates_dlsim.shape == rates_loihisim.shape - assert allclose(rates_dlsim, rates_ref, atol=atol, rtol=0, xtol=1) - assert allclose(rates_loihisim, rates_ref, atol=atol, rtol=0, xtol=1) + assert rates_ref.shape == rates_nengosim.shape + assert allclose(rates_nengosim, rates_ref, atol=atol, rtol=0, xtol=1) + if rates_dlsim is not None: + assert rates_ref.shape == rates_dlsim.shape + assert allclose(rates_dlsim, rates_ref, atol=atol, rtol=0, xtol=1) + if rates_loihisim is not None: + assert rates_ref.shape == rates_loihisim.shape + assert allclose(rates_loihisim, rates_ref, atol=atol, rtol=0, xtol=1) def test_lif_min_voltage(Simulator, plt, allclose):
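The train-then-swap workflow this last patch enables looks roughly as follows
(a sketch only: the network structure, dimensions, and run time below are
illustrative, and the training step with ``nengo_dl.Simulator`` is elided):

    import nengo
    import nengo_loihi
    from nengo_loihi.neurons import (
        LoihiRectifiedLinear,
        LoihiSpikingRectifiedLinear,
    )

    with nengo.Network() as net:
        stim = nengo.Node(0.5)
        # rate neurons: differentiable, so NengoDL can train through them
        ens = nengo.Ensemble(100, 1, neuron_type=LoihiRectifiedLinear())
        nengo.Connection(stim, ens)
        probe = nengo.Probe(ens, synapse=0.01)

    # ... train/tune with nengo_dl.Simulator(net) here ...

    # swap in the spiking equivalent before running on Loihi
    ens.neuron_type = LoihiSpikingRectifiedLinear()
    with nengo_loihi.Simulator(net) as sim:
        sim.run(1.0)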