From c19fb0d9193894cf8da2a9ac811b276adc4d889c Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <erichuns@gmail.com>
Date: Wed, 30 Oct 2019 14:36:04 -0400
Subject: [PATCH 01/24] Ensure scipy.sparse is properly imported

Some import statements just imported `scipy` when we needed
`scipy.sparse`. Import order differences made this an occasional
bug. Fixes #252.
---
 nengo_loihi/block.py                            | 2 +-
 nengo_loihi/builder/connection.py               | 2 +-
 nengo_loihi/builder/sparse_matrix.py            | 2 +-
 nengo_loihi/builder/tests/test_sparse_matrix.py | 2 +-
 nengo_loihi/compat.py                           | 2 +-
 nengo_loihi/tests/test_connection.py            | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/nengo_loihi/block.py b/nengo_loihi/block.py
index 96509b0c2..5201b5c56 100644
--- a/nengo_loihi/block.py
+++ b/nengo_loihi/block.py
@@ -3,7 +3,7 @@
 from nengo.exceptions import BuildError
 
 import numpy as np
-import scipy
+import scipy.sparse
 
 from nengo_loihi.nxsdk_obfuscation import d
 
diff --git a/nengo_loihi/builder/connection.py b/nengo_loihi/builder/connection.py
index cf7a58167..14d7630cb 100644
--- a/nengo_loihi/builder/connection.py
+++ b/nengo_loihi/builder/connection.py
@@ -15,7 +15,7 @@
 from nengo.exceptions import BuildError, ValidationError
 from nengo.solvers import NoSolver, Solver
 import numpy as np
-import scipy
+import scipy.sparse
 
 from nengo_loihi.block import Axon, LoihiBlock, Probe, Synapse
 from nengo_loihi.builder.builder import Builder
diff --git a/nengo_loihi/builder/sparse_matrix.py b/nengo_loihi/builder/sparse_matrix.py
index 8e0610d56..741d6ae3c 100644
--- a/nengo_loihi/builder/sparse_matrix.py
+++ b/nengo_loihi/builder/sparse_matrix.py
@@ -1,5 +1,5 @@
 import numpy as np
-import scipy
+import scipy.sparse
 
 
 def expand_matrix(matrix, shape):
diff --git a/nengo_loihi/builder/tests/test_sparse_matrix.py b/nengo_loihi/builder/tests/test_sparse_matrix.py
index a13c5f11e..d3f36135e 100644
--- a/nengo_loihi/builder/tests/test_sparse_matrix.py
+++ b/nengo_loihi/builder/tests/test_sparse_matrix.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pytest
-import scipy
+import scipy.sparse
 
 from nengo_loihi.builder.sparse_matrix import (
     expand_matrix,
diff --git a/nengo_loihi/compat.py b/nengo_loihi/compat.py
index 211d5cad4..3eee46cd7 100644
--- a/nengo_loihi/compat.py
+++ b/nengo_loihi/compat.py
@@ -3,7 +3,7 @@
 
 import nengo
 import numpy as np
-import scipy
+import scipy.sparse
 
 logger = logging.getLogger(__name__)
 
diff --git a/nengo_loihi/tests/test_connection.py b/nengo_loihi/tests/test_connection.py
index 8b7b8967b..3316ecf04 100644
--- a/nengo_loihi/tests/test_connection.py
+++ b/nengo_loihi/tests/test_connection.py
@@ -3,7 +3,7 @@
 from nengo.utils.matplotlib import rasterplot
 import numpy as np
 import pytest
-import scipy
+import scipy.sparse
 
 from nengo_loihi.builder import connection
 from nengo_loihi.compat import nengo_transforms

From 26140ee405127c31042a19fce5b7579f90b67319 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <erichuns@gmail.com>
Date: Wed, 30 Oct 2019 14:22:43 -0400
Subject: [PATCH 02/24] Run tests in python 3.6 by default

This allows us to do a proper `bones-check` with `black`.
The hardware tests are still in 3.5.2 to support NxSDK.

This commit also fixes some slight changes by `nengo-bones` 0.6.0
that were missed in the upgrade commit because of the missing
`bones-check`.
---
 .nengobones.yml |  5 +++--
 .travis.yml     |  3 ++-
 docs/conf.py    |  5 ++++-
 setup.py        | 10 +++++++---
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/.nengobones.yml b/.nengobones.yml
index f03803903..897932f13 100644
--- a/.nengobones.yml
+++ b/.nengobones.yml
@@ -437,7 +437,7 @@ docs_conf_py:
   analytics_id: UA-41658423-2
 
 travis_yml:
-  python: 3.5.2
+  python: 3.6
   global_vars:
     NENGO_VERSION: nengo[tests]
     NENGO_DL_VERSION: nengo-dl
@@ -452,6 +452,7 @@ travis_yml:
         NENGO_VERSION: git+https://github.com/nengo/nengo.git#egg=nengo[tests]
         NENGO_DL_VERSION: git+https://github.com/nengo/nengo-dl.git#egg=nengo-dl
     - script: hardware
+      python: 3.5.2  # nxsdk requires 3.5.2
       env:
         NENGO_VERSION: git+https://github.com/nengo/nengo.git#egg=nengo[tests]
         NENGO_DL_VERSION: git+https://github.com/nengo/nengo-dl.git#egg=nengo-dl
@@ -472,7 +473,7 @@ ci_scripts:
       - $NENGO_VERSION
       - $NENGO_DL_VERSION
       - jupyter
-      - numpy
+      - numpy>=1.14  # avoid the default-installed 1.13 on TravisCI
     coverage: true
     nengo_tests: true
   - template: static
diff --git a/.travis.yml b/.travis.yml
index d46e3f8ab..aa1dfbc45 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 # Automatically generated by nengo-bones, do not edit this file directly
 
 language: python
-python: 3.5.2
+python: 3.6
 notifications:
   email:
     on_success: change
@@ -43,6 +43,7 @@ jobs:
       NENGO_DL_VERSION="git+https://github.com/nengo/nengo-dl.git#egg=nengo-dl"
       NXSDK_VERSION="0.8.5"
       SCRIPT="hardware"
+    python: 3.5.2
   -
     env:
       NENGO_VERSION="git+https://github.com/nengo/nengo.git#egg=nengo[tests]"
diff --git a/docs/conf.py b/docs/conf.py
index 4c16775e5..8528336fd 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -43,7 +43,10 @@
 
 # -- sphinx
 nitpicky = True
-exclude_patterns = ["_build", "**/.ipynb_checkpoints"]
+exclude_patterns = [
+    "_build",
+    "**/.ipynb_checkpoints",
+]
 linkcheck_timeout = 30
 source_suffix = ".rst"
 source_encoding = "utf-8"
diff --git a/setup.py b/setup.py
index 98ce1648e..b3a501fd2 100755
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,11 @@ def read(*filenames, **kwargs):
 root = os.path.dirname(os.path.realpath(__file__))
 version = runpy.run_path(os.path.join(root, "nengo_loihi", "version.py"))["version"]
 
-install_req = ["jinja2", "nengo>=2.8.0", "scipy>=1.2.1"]
+install_req = [
+    "jinja2",
+    "nengo>=2.8.0",
+    "scipy>=1.2.1",
+]
 docs_req = [
     "abr_control",
     "jupyter",
@@ -76,8 +80,8 @@ def read(*filenames, **kwargs):
         "tests": tests_req,
     },
     python_requires=">=3.4",
-    entry_points={"nengo.backends": ["loihi = nengo_loihi:Simulator"]},
-    package_data={"nengo_loihi": ["nengo_loihi/snips/*"]},
+    entry_points={"nengo.backends": ["loihi = nengo_loihi:Simulator",],},
+    package_data={"nengo_loihi": ["nengo_loihi/snips/*",],},
     classifiers=[
         "Development Status :: 3 - Alpha",
         "Framework :: Nengo",

From 805778aeeb63b31bec8094c45799df478807376c Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <erichuns@gmail.com>
Date: Wed, 3 Jul 2019 17:01:21 -0400
Subject: [PATCH 03/24] Support NxSDK 0.8.7

Not backwards compatible with previous versions.
---
 .nengobones.yml                              | 2 +-
 .travis.yml                                  | 2 +-
 CHANGES.rst                                  | 2 ++
 nengo_loihi/hardware/builder.py              | 6 +++---
 nengo_loihi/hardware/interface.py            | 4 ++--
 nengo_loihi/hardware/tests/test_interface.py | 2 +-
 6 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/.nengobones.yml b/.nengobones.yml
index 897932f13..2a1a3e9e0 100644
--- a/.nengobones.yml
+++ b/.nengobones.yml
@@ -456,7 +456,7 @@ travis_yml:
       env:
         NENGO_VERSION: git+https://github.com/nengo/nengo.git#egg=nengo[tests]
         NENGO_DL_VERSION: git+https://github.com/nengo/nengo-dl.git#egg=nengo-dl
-        NXSDK_VERSION: 0.8.5
+        NXSDK_VERSION: 0.8.7
     - script: docs
       env:
         NENGO_VERSION: git+https://github.com/nengo/nengo.git#egg=nengo[tests]
diff --git a/.travis.yml b/.travis.yml
index aa1dfbc45..c87378040 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,7 +41,7 @@ jobs:
     env:
       NENGO_VERSION="git+https://github.com/nengo/nengo.git#egg=nengo[tests]"
       NENGO_DL_VERSION="git+https://github.com/nengo/nengo-dl.git#egg=nengo-dl"
-      NXSDK_VERSION="0.8.5"
+      NXSDK_VERSION="0.8.7"
       SCRIPT="hardware"
     python: 3.5.2
   -
diff --git a/CHANGES.rst b/CHANGES.rst
index 2a2883d9a..0ffd7f4c2 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -39,6 +39,8 @@ Release history
   (`#226 <https://github.com/nengo/nengo-loihi/pull/226>`__)
 - The ``scipy`` package is now required to run Nengo Loihi.
   (`#240 <https://github.com/nengo/nengo-loihi/pull/240>`__)
+- Nengo Loihi now requires NxSDK version 0.8.7.
+  (`#234 <https://github.com/nengo/nengo-loihi/pull/234>`__)
 
 0.8.0 (June 23, 2019)
 =====================
diff --git a/nengo_loihi/hardware/builder.py b/nengo_loihi/hardware/builder.py
index 97e4902b5..fc0d6b9a7 100644
--- a/nengo_loihi/hardware/builder.py
+++ b/nengo_loihi/hardware/builder.py
@@ -50,9 +50,9 @@ def build_board(board, seed=None):
 
 
 def build_chip(nxsdk_chip, chip, seed=None):
-    assert len(chip.cores) == len(d_get(nxsdk_chip, b"bjJDb3Jlcw=="))
+    assert len(chip.cores) == len(d_get(nxsdk_chip, b"bjJDb3Jlc0FzTGlzdA=="))
     rng = np.random.RandomState(seed)
-    for core, nxsdk_core in zip(chip.cores, d_get(nxsdk_chip, b"bjJDb3Jlcw==")):
+    for core, nxsdk_core in zip(chip.cores, d_get(nxsdk_chip, b"bjJDb3Jlc0FzTGlzdA==")):
         logger.debug("Building core %s", core)
         seed = rng.randint(npext.maxint)
         build_core(nxsdk_core, core, seed=seed)
@@ -539,7 +539,7 @@ def build_axons(nxsdk_core, core, block, axon, compartment_ids, pop_id_map):
     nxsdk_board = d_get(nxsdk_core, b"cGFyZW50", b"cGFyZW50")
     tchip_id = d_get(d_get(nxsdk_board, b"bjJDaGlwcw==")[tchip_idx], b"aWQ=")
     tcore_id = d_get(
-        d_get(d_get(nxsdk_board, b"bjJDaGlwcw==")[tchip_idx], b"bjJDb3Jlcw==")[
+        d_get(d_get(nxsdk_board, b"bjJDaGlwcw==")[tchip_idx], b"bjJDb3Jlc0FzTGlzdA==")[
             tcore_idx
         ],
         b"aWQ=",
diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index 4822c9d7a..49d94ff73 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -90,8 +90,8 @@ def check_nxsdk_version():
 
         # if installed, check version
         version = LooseVersion(getattr(nxsdk, "__version__", "0.0.0"))
-        minimum = LooseVersion("0.8.5")
-        max_tested = LooseVersion("0.8.5")
+        minimum = LooseVersion("0.8.7")
+        max_tested = LooseVersion("0.8.7")
         if version < minimum:
             raise ImportError(
                 "nengo-loihi requires nxsdk>=%s, found %s" % (minimum, version)
diff --git a/nengo_loihi/hardware/tests/test_interface.py b/nengo_loihi/hardware/tests/test_interface.py
index 3989588de..2dc6ba8c3 100644
--- a/nengo_loihi/hardware/tests/test_interface.py
+++ b/nengo_loihi/hardware/tests/test_interface.py
@@ -27,7 +27,7 @@ def test_error_on_old_version(monkeypatch):
 
 def test_no_warn_on_current_version(monkeypatch):
     mock = MockNxsdk()
-    mock.__version__ = "0.8.5"
+    mock.__version__ = "0.8.7"
 
     monkeypatch.setattr(hardware_interface, "nxsdk", mock)
     monkeypatch.setattr(hardware_interface, "assert_nxsdk", lambda: True)

From ad9a35328794ca5cf4bec8b12da3777c64cc289a Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Tue, 22 Oct 2019 16:41:22 -0400
Subject: [PATCH 04/24] Support NxSDK 0.9.0

---
 .nengobones.yml                              |  2 +-
 .travis.yml                                  |  2 +-
 CHANGES.rst                                  |  2 ++
 nengo_loihi/hardware/interface.py            | 18 ++++++++++--------
 nengo_loihi/hardware/tests/test_interface.py |  2 +-
 5 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/.nengobones.yml b/.nengobones.yml
index 2a1a3e9e0..444070cfc 100644
--- a/.nengobones.yml
+++ b/.nengobones.yml
@@ -456,7 +456,7 @@ travis_yml:
       env:
         NENGO_VERSION: git+https://github.com/nengo/nengo.git#egg=nengo[tests]
         NENGO_DL_VERSION: git+https://github.com/nengo/nengo-dl.git#egg=nengo-dl
-        NXSDK_VERSION: 0.8.7
+        NXSDK_VERSION: 0.9
     - script: docs
       env:
         NENGO_VERSION: git+https://github.com/nengo/nengo.git#egg=nengo[tests]
diff --git a/.travis.yml b/.travis.yml
index c87378040..6c9810c99 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,7 +41,7 @@ jobs:
     env:
       NENGO_VERSION="git+https://github.com/nengo/nengo.git#egg=nengo[tests]"
       NENGO_DL_VERSION="git+https://github.com/nengo/nengo-dl.git#egg=nengo-dl"
-      NXSDK_VERSION="0.8.7"
+      NXSDK_VERSION="0.9"
       SCRIPT="hardware"
     python: 3.5.2
   -
diff --git a/CHANGES.rst b/CHANGES.rst
index 0ffd7f4c2..24cdf9f3a 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -31,6 +31,8 @@ Release history
   (`#240 <https://github.com/nengo/nengo-loihi/pull/240>`__)
 - A more informative error message is raised if any encoders contain NaNs.
   (`#251 <https://github.com/nengo/nengo-loihi/pull/251>`__)
+- Nengo Loihi now supports NxSDK version 0.9.0.
+  (`#255 <https://github.com/nengo/nengo-loihi/pull/255>`__)
 
 **Changed**
 
diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index 49d94ff73..095514022 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -43,6 +43,9 @@ class HardwareInterface:
         Defaults to one block and one input per core on a single chip.
     """
 
+    min_nxsdk_version = LooseVersion("0.8.7")
+    max_nxsdk_version = LooseVersion("0.9.0")
+
     def __init__(
         self,
         model,
@@ -83,24 +86,23 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self.close()
 
-    @staticmethod
-    def check_nxsdk_version():
+    @classmethod
+    def check_nxsdk_version(cls):
         # raise exception if nxsdk not installed
         assert_nxsdk()
 
         # if installed, check version
         version = LooseVersion(getattr(nxsdk, "__version__", "0.0.0"))
-        minimum = LooseVersion("0.8.7")
-        max_tested = LooseVersion("0.8.7")
-        if version < minimum:
+        if version < cls.min_nxsdk_version:
             raise ImportError(
-                "nengo-loihi requires nxsdk>=%s, found %s" % (minimum, version)
+                "nengo-loihi requires nxsdk>=%s, found %s"
+                % (cls.min_nxsdk_version, version)
             )
-        elif version > max_tested:
+        elif version > cls.max_nxsdk_version:
             warnings.warn(
                 "nengo-loihi has not been tested with your nxsdk "
                 "version (%s); latest fully supported version is "
-                "%s" % (version, max_tested)
+                "%s" % (version, cls.max_nxsdk_version)
             )
 
     def _iter_blocks(self):
diff --git a/nengo_loihi/hardware/tests/test_interface.py b/nengo_loihi/hardware/tests/test_interface.py
index 2dc6ba8c3..458a9ff82 100644
--- a/nengo_loihi/hardware/tests/test_interface.py
+++ b/nengo_loihi/hardware/tests/test_interface.py
@@ -27,7 +27,7 @@ def test_error_on_old_version(monkeypatch):
 
 def test_no_warn_on_current_version(monkeypatch):
     mock = MockNxsdk()
-    mock.__version__ = "0.8.7"
+    mock.__version__ = str(hardware_interface.HardwareInterface.max_nxsdk_version)
 
     monkeypatch.setattr(hardware_interface, "nxsdk", mock)
     monkeypatch.setattr(hardware_interface, "assert_nxsdk", lambda: True)

From 28322f2da181f0a34e9259357ea20b583a2e4194 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Thu, 17 Oct 2019 10:09:38 -0700
Subject: [PATCH 05/24] Allow user to force precompute=False

This is useful for testing SNIPs.
---
 nengo_loihi/simulator.py | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/nengo_loihi/simulator.py b/nengo_loihi/simulator.py
index 78999877c..941ede2a7 100644
--- a/nengo_loihi/simulator.py
+++ b/nengo_loihi/simulator.py
@@ -97,7 +97,7 @@ def __init__(  # noqa: C901
         dt=0.001,
         seed=None,
         model=None,
-        precompute=False,
+        precompute=None,
         target=None,
         progress_bar=None,
         remove_passthrough=True,
@@ -105,7 +105,6 @@ def __init__(  # noqa: C901
     ):
         # initialize values used in __del__ and close() first
         self.closed = True
-        self.precompute = precompute
         self.network = network
         self.sims = OrderedDict()
         self._run_steps = None
@@ -139,7 +138,9 @@ def __init__(  # noqa: C901
 
         # determine how to split the host into one, two or three models
         self.model.split = Split(
-            network, precompute=precompute, remove_passthrough=remove_passthrough
+            network,
+            precompute=False if precompute is None else precompute,
+            remove_passthrough=remove_passthrough,
         )
 
         # Build the network into the model
@@ -154,8 +155,24 @@ def __init__(  # noqa: C901
             self.model.seeded[conn] = False
             self.model.build(conn)
 
-        if len(self.model.host_pre.params):
-            assert precompute
+        # Create host_pre and host simulators if necessary
+        self.precompute = precompute
+        if precompute is None:
+            # If there is no host then all objects must be on the chip,
+            # which is precomputable in the sense that
+            # no communication has to happen with the host.
+            self.precompute = len(self.model.host.params) == 0
+        elif len(self.model.host_pre.params) == 0 and precompute:
+            warnings.warn(
+                "No precomputable objects. Setting precompute=True has no effect."
+            )
+        elif len(self.model.host.params) == 0 and not precompute:
+            warnings.warn(
+                "Model is pre-computable. Setting precompute=False may slow execution."
+            )
+
+        if len(self.model.host_pre.params) > 0:
+            assert self.precompute
             self.sims["host_pre"] = nengo.Simulator(
                 network=None,
                 dt=self.dt,
@@ -163,12 +180,8 @@ def __init__(  # noqa: C901
                 progress_bar=False,
                 optimize=False,
             )
-        elif precompute:
-            warnings.warn(
-                "No precomputable objects. Setting " "precompute=True has no effect."
-            )
 
-        if len(self.model.host.params):
+        if len(self.model.host.params) > 0:
             self.sims["host"] = nengo.Simulator(
                 network=None,
                 dt=self.dt,
@@ -176,13 +189,6 @@ def __init__(  # noqa: C901
                 progress_bar=False,
                 optimize=False,
             )
-        elif not precompute:
-            # If there is no host and precompute=False, then all objects
-            # must be on the chip, which is precomputable in the sense that
-            # no communication has to happen with the host.
-            # We could warn about this, but we want to avoid people having
-            # to specify `precompute` unless they absolutely have to.
-            self.precompute = True
 
         self._probe_outputs = self.model.params
         self.data = ProbeDict(self._probe_outputs)

From 8c10b00a5a5d3a115031ce96f300a3ecd5beb36b Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Fri, 18 Oct 2019 09:53:21 -0700
Subject: [PATCH 06/24] Add wall-time step timers to Simulator

- Add a timer around the `Simulator._run_steps` call, to measure the
  time taken for all steps.
- Connect to the board outside the timing loop, so that this does not
  count towards the step time.
- Add a timer specific to SNIPs, to get the most accurate timing
  (after we call the board run function, so all setup has happened).
---
 nengo_loihi/hardware/interface.py | 32 +++++++++++++++++--------------
 nengo_loihi/simulator.py          | 13 +++++++++++++
 2 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index 095514022..7cad09d6e 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -60,9 +60,6 @@ def __init__(
             )
 
         self.closed = False
-        self.use_snips = use_snips
-        self.check_nxsdk_version()
-
         self.nxsdk_board = None
         self.nengo_io_h2c = None  # IO snip host-to-chip channel
         self.nengo_io_c2h = None  # IO snip chip-to-host channel
@@ -71,14 +68,20 @@ def __init__(
         self._snip_probe_data = collections.OrderedDict()
         self._chip2host_sent_steps = 0
 
+        self.model = model
+        self.use_snips = use_snips
+        self.seed = seed
         # Maximum number of spikes that can be sent through
         # the nengo_io_h2c channel on one timestep.
         self.snip_max_spikes_per_step = snip_max_spikes_per_step
+        self.allocator = allocator
+
+        self.check_nxsdk_version()
 
         # clear cached content from SpikeProbe class attribute
         d_func(SpikeProbe, b"cHJvYmVEaWN0", b"Y2xlYXI=")
 
-        self.build(model, allocator=allocator, seed=seed)
+        self.build()
 
     def __enter__(self):
         return self
@@ -113,10 +116,11 @@ def _iter_probes(self):
             for probe in block.probes:
                 yield probe
 
-    def build(self, model, allocator, seed=None):
-        validate_model(model)
-        self.model = model
-        self.pes_error_scale = getattr(model, "pes_error_scale", 1.0)
+    def build(self):
+        assert self.nxsdk_board is None, "Cannot rebuild model"
+
+        validate_model(self.model)
+        self.pes_error_scale = getattr(self.model, "pes_error_scale", 1.0)
 
         if self.use_snips:
             # tag all probes as being snip-based,
@@ -126,20 +130,20 @@ def build(self, model, allocator, seed=None):
                 self._snip_probe_data[probe] = []
 
         # --- allocate
-        self.board = allocator(self.model)
+        self.board = self.allocator(self.model)
 
         # --- validate
         validate_board(self.board)
 
         # --- build
-        self.nxsdk_board = build_board(self.board, seed=seed)
+        self.nxsdk_board = build_board(self.board, seed=self.seed)
 
-    def run_steps(self, steps, blocking=True):
-        if self.use_snips and self.nengo_io_h2c is None:
+        # --- create snips
+        if self.use_snips:
             self.create_io_snip()
 
-        # NOTE: we need to call connect() after snips are created
-        self.connect()
+    def run_steps(self, steps, blocking=True):
+        self.connect()  # returns immediately if already connected
         d_get(self.nxsdk_board, b"cnVu")(steps, **{d(b"YVN5bmM="): not blocking})
 
     def _chip2host_monitor(self, probes_receivers):
diff --git a/nengo_loihi/simulator.py b/nengo_loihi/simulator.py
index 941ede2a7..042d853bd 100644
--- a/nengo_loihi/simulator.py
+++ b/nengo_loihi/simulator.py
@@ -1,5 +1,6 @@
 from collections import OrderedDict
 import logging
+import timeit
 import traceback
 import warnings
 
@@ -322,6 +323,7 @@ def reset(self, seed=None):
 
         self._n_steps = 0
         self._time = 0
+        self.timers = dict(steps=0.0)
 
         # clear probe data
         for probe in self.model.probes:
@@ -511,13 +513,18 @@ def loihi_precomputed_host_only(steps):
 
         else:
             assert host is not None, "Model is precomputable"
+            self.timers["snips"] = 0
 
             def loihi_bidirectional_with_host(steps):
                 loihi.run_steps(steps, blocking=False)
+                time0 = timeit.default_timer()
                 for _ in range(steps):
                     host.step()
                     self._host2chip(loihi)
                     self._chip2host(loihi)
+                time1 = timeit.default_timer()
+                self.timers["snips"] += time1 - time0
+
                 logger.info("Waiting for run_steps to complete...")
                 loihi.wait_for_completion()
                 logger.info("run_steps completed")
@@ -536,6 +543,11 @@ def run_steps(self, steps):
             raise SimulatorClosed("Simulator cannot run because it is closed.")
 
         self._make_run_steps()
+
+        if "loihi" in self.sims:
+            self.sims["loihi"].connect()  # connect outside timing loop
+
+        time0 = timeit.default_timer()
         try:
             self._run_steps(steps)
         except Exception:
@@ -561,6 +573,7 @@ def run_steps(self, steps):
                 )
             raise
 
+        self.timers["steps"] += timeit.default_timer() - time0
         self._n_steps += steps
         logger.info("Finished running for %d steps", steps)
         self._probe()

From 3b931555b173d917d6476f021c5b6158881b9b81 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Thu, 17 Oct 2019 11:40:31 -0400
Subject: [PATCH 07/24] Only create global spike generator if required

This reduces unnecessary communication with the chip
---
 nengo_loihi/hardware/builder.py   | 8 ++++++--
 nengo_loihi/hardware/interface.py | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/nengo_loihi/hardware/builder.py b/nengo_loihi/hardware/builder.py
index fc0d6b9a7..fb0f1c80f 100644
--- a/nengo_loihi/hardware/builder.py
+++ b/nengo_loihi/hardware/builder.py
@@ -22,7 +22,7 @@
 logger = logging.getLogger(__name__)
 
 
-def build_board(board, seed=None):
+def build_board(board, use_snips=False, seed=None):
     n_chips = board.n_chips
     n_cores_per_chip = board.n_cores_per_chip
     n_synapses_per_core = board.n_synapses_per_core
@@ -32,7 +32,7 @@ def build_board(board, seed=None):
 
     # add our own attribute for storing our spike generator
     assert not hasattr(nxsdk_board, "global_spike_generator")
-    nxsdk_board.global_spike_generator = SpikeGen(nxsdk_board)
+    nxsdk_board.global_spike_generator = None if use_snips else SpikeGen(nxsdk_board)
 
     # custom attr for storing SpikeInputs (filled in build_input)
     assert not hasattr(nxsdk_board, "spike_inputs")
@@ -380,6 +380,10 @@ def build_input(nxsdk_core, core, spike_input, compartment_idxs):
 
     # add any pre-existing spikes to spikegen
     for t in spike_input.spike_times():
+        assert (
+            nxsdk_board.global_spike_generator is not None
+        ), "Cannot add pre-existing spikes when using SNIPs (no spike generator)"
+
         spikes = spike_input.spike_idxs(t)
         for spike in loihi_input.spikes_to_loihi(t, spikes):
             assert (
diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index 7cad09d6e..e42d24899 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -136,7 +136,9 @@ def build(self):
         validate_board(self.board)
 
         # --- build
-        self.nxsdk_board = build_board(self.board, seed=self.seed)
+        self.nxsdk_board = build_board(
+            self.board, use_snips=self.use_snips, seed=self.seed
+        )
 
         # --- create snips
         if self.use_snips:

From 1f00d5557e870262917591eb57d7782eb6a0dc81 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Wed, 23 Oct 2019 15:31:00 -0400
Subject: [PATCH 08/24] Add host SNIP for faster communication

The host SNIP runs on the host and facilitates communication
with the superhost using sockets. This is faster than using
the default RPC interface.

We also take care to make sure both the host and chip SNIPs
end properly, by sending a message with a negative spike count.
This helps to eliminate board hangs.

To allow the host SNIP to work with multiple `run` calls, we
keep it idling in between `run` calls, waiting for a message.
If the board disconnects before a subsequent run call,
the negative spike count message will tell the host SNIP to stop.
---
 nengo_loihi/compat.py                         |   2 +
 nengo_loihi/hardware/interface.py             | 105 +++++++++++--
 nengo_loihi/hardware/nxsdk_shim.py            |   2 +
 .../hardware/snips/nengo_host.cc.template     | 147 ++++++++++++++++++
 .../hardware/snips/nengo_io.c.template        |  13 +-
 nengo_loihi/hardware/tests/test_interface.py  |  50 ++++++
 nengo_loihi/simulator.py                      |  27 +---
 7 files changed, 308 insertions(+), 38 deletions(-)
 create mode 100644 nengo_loihi/hardware/snips/nengo_host.cc.template

diff --git a/nengo_loihi/compat.py b/nengo_loihi/compat.py
index 3eee46cd7..bcc7f61ba 100644
--- a/nengo_loihi/compat.py
+++ b/nengo_loihi/compat.py
@@ -12,6 +12,7 @@
     from nengo.builder.network import seed_network
     from nengo.builder.transforms import multiply
     import nengo.transforms as nengo_transforms
+    from nengo.utils.testing import signals_allclose
 
     def conn_solver(solver, activities, targets, rng):
         return solver(activities, targets, rng=rng)
@@ -36,6 +37,7 @@ def make_process_step(process, shape_in, shape_out, dt, rng, dtype=None):
 
 else:
     from nengo.builder.connection import multiply
+    from nengo.utils.testing import allclose as signals_allclose
 
     nengo_transforms = None
     from nengo.dists import get_samples as _get_samples
diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index e42d24899..e302a54f8 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -3,6 +3,8 @@
 import logging
 import os
 import shutil
+import socket
+import struct
 import tempfile
 import time
 import warnings
@@ -17,13 +19,21 @@
 from nengo_loihi.hardware.allocators import OneToOne, RoundRobin
 from nengo_loihi.hardware.builder import build_board
 from nengo_loihi.nxsdk_obfuscation import d, d_func, d_get
-from nengo_loihi.hardware.nxsdk_shim import assert_nxsdk, nxsdk, SpikeProbe
+from nengo_loihi.hardware.nxsdk_shim import assert_nxsdk, nxsdk, SnipPhase, SpikeProbe
 from nengo_loihi.hardware.validate import validate_board
 from nengo_loihi.validate import validate_model
 
 logger = logging.getLogger(__name__)
 
 
+def ceil_div(a, b):
+    return -((-a) // b)
+
+
+def roundup(a, b):
+    return b * ceil_div(a, b)
+
+
 class HardwareInterface:
     """Place a Model onto a Loihi board and run it.
 
@@ -63,6 +73,9 @@ def __init__(
         self.nxsdk_board = None
         self.nengo_io_h2c = None  # IO snip host-to-chip channel
         self.nengo_io_c2h = None  # IO snip chip-to-host channel
+        self.host_socket = None  # IO snip superhost (this) <-> host socket
+        self.host_socket_connected = False
+        self.host_socket_port = None
         self._probe_filters = {}
         self._probe_filter_pos = {}
         self._snip_probe_data = collections.OrderedDict()
@@ -148,6 +161,22 @@ def run_steps(self, steps, blocking=True):
         self.connect()  # returns immediately if already connected
         d_get(self.nxsdk_board, b"cnVu")(steps, **{d(b"YVN5bmM="): not blocking})
 
+        # pause to allow host snip to start and listen for connection
+        time.sleep(0.1)
+
+        # connect to host socket
+        if self.host_socket is not None and not self.host_socket_connected:
+            host_address = self.nxsdk_board.executor._host_coordinator.hostAddr
+            print(
+                "Connecting to host socket at (%s, %s)"
+                % (host_address, self.host_socket_port)
+            )
+            self.host_socket.connect((host_address, self.host_socket_port))
+            self.host_socket_connected = True
+
+        # send number of steps to host process
+        self.host_socket.send(struct.pack("i", steps))
+
     def _chip2host_monitor(self, probes_receivers):
         increment = None
         for probe, receiver in probes_receivers.items():
@@ -181,9 +210,9 @@ def _chip2host_monitor(self, probes_receivers):
             self._chip2host_sent_steps += increment
 
     def _chip2host_snips(self, probes_receivers):
-        count = self.nengo_io_c2h_count
-        data = self.nengo_io_c2h.read(count)
-        time_step, data = data[0], np.array(data[1:], dtype=np.int32)
+        data = self.host_socket.recv(4 * self.nengo_io_c2h_count)
+        data = np.frombuffer(data, dtype=np.int32)
+        time_step, data = data[0], data[1:]
         snip_range = self.nengo_io_snip_range
 
         for probe in self._snip_probe_data:
@@ -237,6 +266,8 @@ def _host2chip_spikegen(self, loihi_spikes):
             )
 
     def _host2chip_snips(self, loihi_spikes, loihi_errors):
+        assert self.host_socket_connected
+
         max_spikes = self.snip_max_spikes_per_step
         if len(loihi_spikes) > max_spikes:
             warnings.warn(
@@ -255,8 +286,15 @@ def _host2chip_snips(self, loihi_spikes, loihi_errors):
         assert len(loihi_errors) == self.nengo_io_h2c_errors
         for error in loihi_errors:
             msg.extend(error)
-        assert len(msg) <= self.nengo_io_h2c.numElements
-        self.nengo_io_h2c.write(len(msg), msg)
+
+        i = 0
+        send_size = 1024
+        while i < len(msg):
+            msg_i = msg[i : i + send_size]
+            msg_bytes = struct.pack("%di" % len(msg_i), *msg_i)
+            bytes_sent = self.host_socket.send(msg_bytes)
+            assert bytes_sent == len(msg_bytes), "Host socket send failed"
+            i += len(msg_i)
 
     def host2chip(self, spikes, errors):
         loihi_spikes = []
@@ -316,6 +354,16 @@ def connect(self, attempts=10):
             raise SimulationError("Could not connect to the board")
 
     def close(self):
+        if self.host_socket is not None and self.host_socket_connected:
+            # send -1 to signal host/chip that we're done
+            self.host_socket.send(struct.pack("i", -1))
+
+            # pause to allow chip to receive -1 signal via host
+            time.sleep(0.1)
+
+            self.host_socket.close()
+            self.host_socket_connected = False
+
         if self.nxsdk_board is not None:
             d_func(self.nxsdk_board, b"ZGlzY29ubmVjdA==")
 
@@ -447,10 +495,12 @@ def create_io_snip(self):
             len(cores),
             len(probes),
         )
+        chip_buffer_size = max(1024, roundup(n_outputs, SpikePacker.size()))
         code = template.render(
             n_outputs=n_outputs,
             n_errors=n_errors,
             max_error_len=max_error_len,
+            buffer_size=chip_buffer_size,
             cores=cores,
             probes=probes,
             obfs=obfs,
@@ -463,8 +513,8 @@ def create_io_snip(self):
         with open(os.path.join(snips_dir, "nengo_learn.c"), "w") as f:
             f.write(code)
 
-        # --- create SNIP process and channels
-        logger.debug("Creating nengo_io snip process")
+        # --- create chip processes
+        logger.debug("Creating nengo_io chip process")
         nengo_io = d_func(
             self.nxsdk_board,
             b"Y3JlYXRlUHJvY2Vzcw==",
@@ -477,7 +527,7 @@ def create_io_snip(self):
                 b"cGhhc2U=": d(b"bWdtdA=="),
             },
         )
-        logger.debug("Creating nengo_learn snip process")
+        logger.debug("Creating nengo_learn chip process")
         c_path = os.path.join(self.tmp_snip_dir.name, "nengo_learn.c")
         shutil.copyfile(os.path.join(snips_dir, "nengo_learn.c"), c_path)
         d_func(
@@ -493,6 +543,39 @@ def create_io_snip(self):
             },
         )
 
+        # --- create host process (for faster communication via sockets)
+        host_socket_port = np.random.randint(50000, 60000)
+
+        max_inputs = n_errors + self.snip_max_spikes_per_step * SpikePacker.size()
+        host_template = env.get_template("nengo_host.cc.template")
+        host_path = os.path.join(self.tmp_snip_dir.name, "nengo_host.cc")
+        host_code = host_template.render(
+            host_buffer_size=max(max_inputs, n_outputs),
+            n_outputs=n_outputs,
+            server_port=host_socket_port,
+            input_channel="nengo_io_h2c",
+            output_channel="nengo_io_c2h",
+            obfs=obfs,
+        )
+        with open(host_path, "w") as f:
+            f.write(host_code)
+
+        # make process
+        host_process = d_func(
+            self.nxsdk_board,
+            b"Y3JlYXRlU25pcA==",
+            kwargs={
+                b"cGhhc2U=": SnipPhase.HOST_CONCURRENT_EXECUTION,
+                b"Y3BwRmlsZQ==": host_path,
+            },
+        )
+
+        # connect to host socket
+        print("Making superhost socket")
+        self.host_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        self.host_socket_port = host_socket_port
+
+        # --- create channels
         size = (
             1  # first int stores number of spikes
             + self.snip_max_spikes_per_step * SpikePacker.size()
@@ -506,8 +589,8 @@ def create_io_snip(self):
         self.nengo_io_c2h = d_get(self.nxsdk_board, b"Y3JlYXRlQ2hhbm5lbA==")(
             b"nengo_io_c2h", "int", n_outputs
         )
-        d_get(self.nengo_io_h2c, b"Y29ubmVjdA==")(None, nengo_io)
-        d_get(self.nengo_io_c2h, b"Y29ubmVjdA==")(nengo_io, None)
+        d_get(self.nengo_io_h2c, b"Y29ubmVjdA==")(host_process, nengo_io)
+        d_get(self.nengo_io_c2h, b"Y29ubmVjdA==")(nengo_io, host_process)
         self.nengo_io_h2c_errors = n_errors
         self.nengo_io_c2h_count = n_outputs
         self.nengo_io_snip_range = snip_range
diff --git a/nengo_loihi/hardware/nxsdk_shim.py b/nengo_loihi/hardware/nxsdk_shim.py
index 27b8a858c..5eb172481 100644
--- a/nengo_loihi/hardware/nxsdk_shim.py
+++ b/nengo_loihi/hardware/nxsdk_shim.py
@@ -80,9 +80,11 @@ def assert_nxsdk(exception=exception):
         b"bnhzZGsuZ3JhcGgubnhpbnB1dGdlbi5ueGlucHV0Z2Vu", b"QmFzaWNTcGlrZUdlbmVyYXRvcg=="
     )
     SpikeProbe = d_import(b"bnhzZGsuZ3JhcGgubnhwcm9iZXM=", b"TjJTcGlrZVByb2Jl")
+    SnipPhase = d_import(b"bnhzZGsuZ3JhcGgucHJvY2Vzc2VzLnBoYXNlX2VudW1z", b"UGhhc2U=")
 else:
     micro_gen = None
     TraceConfigGenerator = None
     NxsdkBoard = None
     SpikeGen = None
     SpikeProbe = None
+    SnipPhase = None
diff --git a/nengo_loihi/hardware/snips/nengo_host.cc.template b/nengo_loihi/hardware/snips/nengo_host.cc.template
new file mode 100644
index 000000000..e89da447e
--- /dev/null
+++ b/nengo_loihi/hardware/snips/nengo_host.cc.template
@@ -0,0 +1,147 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <string>
+#include <unistd.h>  // usleep
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+
+#include "nxsdkhost.h"
+
+#define N_OUTPUTS {{ n_outputs }}
+//#define BUFFER_SIZE {{ host_buffer_size }}
+#define BUFFER_SIZE 1024
+
+#define SERVER_PORT htons({{ server_port }})
+
+
+namespace nengo_host {
+
+const char input_channel[] = "{{ input_channel }}";
+const char output_channel[] = "{{ output_channel }}";
+
+class NengoHostProcess : public ConcurrentHostSnip {
+    int32_t buffer[BUFFER_SIZE];
+    int server_socket;
+    int client_socket;
+
+  public:
+    NengoHostProcess() {
+        // --- set up socket to communicate with superhost
+        // This machine will act as the server, and the superhost will
+        // connect in as the client
+        server_socket = socket(AF_INET, SOCK_STREAM, 0);
+
+        sockaddr_in server_address;
+        server_address.sin_family = AF_INET;
+        server_address.sin_port = SERVER_PORT;
+        server_address.sin_addr.s_addr = INADDR_ANY;
+
+        bind(server_socket, (struct sockaddr*)&server_address, sizeof(struct sockaddr));
+
+        // wait for a client
+        std::cout << "Host snip listening for client" << std::endl;
+        listen(server_socket, 1);
+
+        // get incoming client connection
+        sockaddr_in client_address;
+        socklen_t sin_size = sizeof(struct sockaddr_in);
+        client_socket = accept(
+            server_socket, (struct sockaddr*)&client_address, &sin_size
+        );
+
+        std::cout << "Host snip connected to client" << std::endl;
+    }
+
+    ~NengoHostProcess() {
+        std::cout << "Closing host (server) socket" << std::endl;
+        close(server_socket);
+    }
+
+    void run(std::atomic_bool& end_of_execution) override {
+        // --- loop to transmit data from superhost to chip and vice versa
+        int step_counter = -1;
+
+        while (!end_of_execution) {
+            // Wait until host sends number of run steps
+            while (!end_of_execution && step_counter <= 0) {
+                ssize_t n_read0 = recv(client_socket, buffer, sizeof(int32_t), MSG_DONTWAIT);
+                if (n_read0 == 4) {
+                    step_counter = buffer[0];
+                    if (step_counter > 0) {
+                        std::cout << "Host running " << step_counter << " steps" << std::endl;
+                    }
+                    break;
+                } else if (n_read0 > 0) {
+                    std::cout << "Host received unexpected number of bytes ("
+                              << n_read0 << ")" << std::endl;
+                    return;
+                } else {
+                    usleep(100);
+                }
+            }
+            if (end_of_execution) {
+                break;
+            } else if (step_counter <= 0) {
+                break;
+            }
+
+            // read messages from superhost socket
+            size_t n_read = read(client_socket, buffer, BUFFER_SIZE * sizeof(int32_t));
+            if (n_read % sizeof(int32_t) != 0) {
+                std::cout << "Did not read full int32s from socket" << std::endl;
+                break;
+            }
+            if (buffer[0] < 0) {
+                std::cout << "Host received shutdown signal: " << buffer[0] << std::endl;
+                break;
+            }
+
+            // write packets to chip (PACKET_SIZE == sizeof(int32_t))
+            writeChannel(input_channel, buffer, n_read / sizeof(int32_t));
+
+            // wait until chip has written output
+            while (!end_of_execution && !probeChannel(output_channel)) {
+                usleep(100);
+            }
+            if (end_of_execution) {
+                break;
+            }
+
+            // read chip output
+            readChannel(output_channel, buffer, N_OUTPUTS);
+
+            // write output to superhost socket
+            const size_t write_len = N_OUTPUTS;
+            const size_t write_bytes = write_len * sizeof(int32_t);
+            size_t n_write = write(client_socket, buffer, write_bytes);
+            if (n_write != write_bytes) {
+                std::cout << "Failed write to socket (tried " << write_bytes
+                          << ", wrote " << n_write << " bytes)" << std::endl;
+                break;;
+            }
+
+            // next timestep
+            step_counter--;
+        }
+
+        if (step_counter != 0) {
+            // make sure chip gets shutdown signal
+            buffer[0] = (step_counter > 0) ? -1 : step_counter;
+            writeChannel(input_channel, buffer, 1);
+        }
+    }
+};
+
+}  // namespace nengo_host
+
+using nengo_host::NengoHostProcess;
+
+// Each ConcurrentHostSnip is run within a thread
+// If you have more threads on the host cpu, you can choose to create individual
+// snips for input and output
+REGISTER_SNIP(NengoHostProcess, ConcurrentHostSnip);
diff --git a/nengo_loihi/hardware/snips/nengo_io.c.template b/nengo_loihi/hardware/snips/nengo_io.c.template
index a0b31e6e7..8ad902e0a 100644
--- a/nengo_loihi/hardware/snips/nengo_io.c.template
+++ b/nengo_loihi/hardware/snips/nengo_io.c.template
@@ -9,8 +9,14 @@
 #define SPIKE_SIZE {{ obfs.spike_size }}
 #define ERROR_INFO_SIZE {{ obfs.error_info_size }}
 
+int is_shutdown = 0;  // if true, we've been asked to shut down
+
+inline int min(int a, int b) {
+    return (a < b) ? a : b;
+}
+
 int guard_io(runState *s) {
-    return 1;
+    return !is_shutdown;
 }
 
 void nengo_io(runState *s) {
@@ -41,6 +47,11 @@ void nengo_io(runState *s) {
     }
 
     {{ obfs.read }}(in_channel, count, 1);
+    if (count < 0) {
+        printf("Chip received shutdown signal: %d\n", count);
+        is_shutdown = 1;
+        return;
+    }
     if (DEBUG) {
         printf("count %d\n", count[0]);
     }
diff --git a/nengo_loihi/hardware/tests/test_interface.py b/nengo_loihi/hardware/tests/test_interface.py
index 458a9ff82..20355eff4 100644
--- a/nengo_loihi/hardware/tests/test_interface.py
+++ b/nengo_loihi/hardware/tests/test_interface.py
@@ -1,8 +1,11 @@
 import nengo
 from nengo.exceptions import SimulationError
+import numpy as np
 import pytest
 
+import nengo_loihi
 from nengo_loihi.block import Axon, LoihiBlock, Synapse
+from nengo_loihi.compat import signals_allclose
 from nengo_loihi.builder import Model
 from nengo_loihi.discretize import discretize_model
 from nengo_loihi.hardware import interface as hardware_interface
@@ -128,3 +131,50 @@ def test_snip_input_count(Simulator, seed, plt):
     with Simulator(model) as sim:
         with pytest.warns(UserWarning, match="Too many spikes"):
             sim.run(0.01)
+
+
+@pytest.mark.target_loihi
+def test_no_hang_on_exception(Simulator, seed):
+    def input_fn(t):
+        if 0.03 < t < 0.05:
+            raise RuntimeError("test_no_hang")
+
+        return t
+
+    with nengo.Network(seed=seed) as net:
+        u = nengo.Node(input_fn)
+        a = nengo.Ensemble(100, 1)
+        nengo.Connection(u, a)
+        nengo.Probe(a)
+
+    with pytest.raises(RuntimeError, match="test_no_hang"):
+        with nengo_loihi.Simulator(net, precompute=False) as sim:
+            sim.run(0.1)
+
+
+@pytest.mark.target_loihi
+def test_multiple_run_calls(Simulator, seed, plt):
+    """Test that we can re-start the SNIPs for a second `run` call"""
+    # NOTE: probe synapses get applied at the end of `run`, so we keep probe
+    # synapses off and do our own filtering.
+
+    f = lambda t: np.sin(8 * np.pi * t)
+
+    with nengo.Network(seed=seed) as net:
+        nengo_loihi.set_defaults()
+
+        u = nengo.Node(f)
+        a = nengo.Ensemble(100, 1)
+        nengo.Connection(u, a, synapse=None)
+        ap = nengo.Probe(a)
+
+    with nengo_loihi.Simulator(net, precompute=False) as sim:
+        sim.run(0.1)
+        sim.run(0.15)
+
+    t = sim.trange()
+    x = f(sim.trange())
+    y = nengo.synapses.Alpha(0.01).filt(sim.data[ap], dt=sim.dt)
+    plt.plot(t, x)
+    plt.plot(t, y)
+    assert signals_allclose(t, x, y, atol=0.07, rtol=0.07, delay=0.024, plt=plt)
diff --git a/nengo_loihi/simulator.py b/nengo_loihi/simulator.py
index 042d853bd..2483450d6 100644
--- a/nengo_loihi/simulator.py
+++ b/nengo_loihi/simulator.py
@@ -1,7 +1,6 @@
 from collections import OrderedDict
 import logging
 import timeit
-import traceback
 import warnings
 
 import nengo
@@ -17,7 +16,6 @@
 from nengo_loihi.discretize import discretize_model
 from nengo_loihi.emulator import EmulatorInterface
 from nengo_loihi.hardware import HardwareInterface, HAS_NXSDK
-from nengo_loihi.nxsdk_obfuscation import d_func
 from nengo_loihi.splitter import Split
 
 logger = logging.getLogger(__name__)
@@ -548,30 +546,7 @@ def run_steps(self, steps):
             self.sims["loihi"].connect()  # connect outside timing loop
 
         time0 = timeit.default_timer()
-        try:
-            self._run_steps(steps)
-        except Exception:
-            if "loihi" in self.sims and self.sims["loihi"].use_snips:
-                # Need to write to board, otherwise it will wait indefinitely
-                h2c = self.sims["loihi"].nengo_io_h2c
-                c2h = self.sims["loihi"].nengo_io_c2h
-
-                print(traceback.format_exc())
-                print("\nAttempting to end simulation...")
-
-                for _ in range(steps):
-                    h2c.write(h2c.numElements, [0] * h2c.numElements)
-                    c2h.read(c2h.numElements)
-                self.sims["loihi"].wait_for_completion()
-                d_func(
-                    self.sims["loihi"].nxsdk_board,
-                    b"bnhEcml2ZXI=",
-                    b"c3RvcEV4ZWN1dGlvbg==",
-                )
-                d_func(
-                    self.sims["loihi"].nxsdk_board, b"bnhEcml2ZXI=", b"c3RvcERyaXZlcg=="
-                )
-            raise
+        self._run_steps(steps)
 
         self.timers["steps"] += timeit.default_timer() - time0
         self._n_steps += steps

From 24edd657fd297d86411059b094ad388342eef573 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Thu, 17 Oct 2019 10:44:33 -0700
Subject: [PATCH 09/24] Larger channel packet size

This improves performance by reducing the number of channel reads.
---
 nengo_loihi/hardware/interface.py             |  43 +++++-
 .../hardware/snips/nengo_host.cc.template     |  17 ++-
 .../hardware/snips/nengo_io.c.template        | 142 ++++++++++++++----
 3 files changed, 158 insertions(+), 44 deletions(-)

diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index e302a54f8..da0970125 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -55,6 +55,7 @@ class HardwareInterface:
 
     min_nxsdk_version = LooseVersion("0.8.7")
     max_nxsdk_version = LooseVersion("0.9.0")
+    channel_packet_size = 64  # size of channel packets in int32s
 
     def __init__(
         self,
@@ -424,6 +425,7 @@ def create_io_snip(self):
         n_errors = 0
         total_error_len = 0
         max_error_len = 0
+        assert len(self.board.chips) == 1, "Learning not implemented for multiple chips"
         for core in self.board.chips[0].cores:  # TODO: don't assume 1 chip
             if core.learning_coreid:
                 error_len = core.blocks[0].n_neurons // 2
@@ -431,6 +433,8 @@ def create_io_snip(self):
                 n_errors += 1
                 total_error_len += 2 + error_len
 
+        print("n_errors: %d" % n_errors)
+
         n_outputs = 1
         probes = []
         cores = set()
@@ -460,7 +464,7 @@ def create_io_snip(self):
             get_channel=d(b"Z2V0Q2hhbm5lbElE"),
             int_type=d(b"aW50MzJfdA=="),
             spike_size=d(b"Mg=="),
-            error_info_size=d(b"Mg=="),
+            error_info_size=d(b"Mg==", int),
             step=d(b"dGltZV9zdGVw"),
             read=d(b"cmVhZENoYW5uZWw="),
             write=d(b"d3JpdGVDaGFubmVs"),
@@ -483,6 +487,8 @@ def create_io_snip(self):
             ),
         )
 
+        n_output_packets = ceil_div(n_outputs, self.channel_packet_size)
+
         # --- write c file using template
         template = env.get_template("nengo_io.c.template")
         self.tmp_snip_dir = tempfile.TemporaryDirectory()
@@ -495,12 +501,21 @@ def create_io_snip(self):
             len(cores),
             len(probes),
         )
-        chip_buffer_size = max(1024, roundup(n_outputs, SpikePacker.size()))
+        chip_buffer_size = roundup(
+            max(
+                n_outputs,  # currently, buffer needs to hold all outputs at once
+                self.channel_packet_size
+                + max(SpikePacker.size(), obfs["error_info_size"]),
+            ),
+            self.channel_packet_size,
+        )
         code = template.render(
             n_outputs=n_outputs,
+            n_output_packets=n_output_packets,
             n_errors=n_errors,
             max_error_len=max_error_len,
             buffer_size=chip_buffer_size,
+            packet_size=self.channel_packet_size,
             cores=cores,
             probes=probes,
             obfs=obfs,
@@ -547,14 +562,18 @@ def create_io_snip(self):
         host_socket_port = np.random.randint(50000, 60000)
 
         max_inputs = n_errors + self.snip_max_spikes_per_step * SpikePacker.size()
+        host_buffer_size = roundup(max(max_inputs, n_outputs), self.channel_packet_size)
+
         host_template = env.get_template("nengo_host.cc.template")
         host_path = os.path.join(self.tmp_snip_dir.name, "nengo_host.cc")
         host_code = host_template.render(
-            host_buffer_size=max(max_inputs, n_outputs),
+            host_buffer_size=host_buffer_size,
             n_outputs=n_outputs,
+            n_output_packets=n_output_packets,
             server_port=host_socket_port,
             input_channel="nengo_io_h2c",
             output_channel="nengo_io_c2h",
+            packet_size=self.channel_packet_size,
             obfs=obfs,
         )
         with open(host_path, "w") as f:
@@ -576,18 +595,28 @@ def create_io_snip(self):
         self.host_socket_port = host_socket_port
 
         # --- create channels
-        size = (
+        input_channel_size = (
             1  # first int stores number of spikes
             + self.snip_max_spikes_per_step * SpikePacker.size()
             + total_error_len
         )
-        logger.debug("Creating nengo_io_h2c channel (%d)" % size)
+        logger.debug("Creating nengo_io_h2c channel (%d)" % input_channel_size)
         self.nengo_io_h2c = d_get(self.nxsdk_board, b"Y3JlYXRlQ2hhbm5lbA==")(
-            b"nengo_io_h2c", "int", size
+            b"nengo_io_h2c",
+            **{
+                d(b"bnVtRWxlbWVudHM="): input_channel_size,
+                d(b"bWVzc2FnZVNpemU="): 4 * self.channel_packet_size,
+                d(b"c2xhY2s="): 16,  # size of buffer on chip (in packets)
+            },
         )
         logger.debug("Creating nengo_io_c2h channel (%d)" % n_outputs)
         self.nengo_io_c2h = d_get(self.nxsdk_board, b"Y3JlYXRlQ2hhbm5lbA==")(
-            b"nengo_io_c2h", "int", n_outputs
+            b"nengo_io_c2h",
+            **{
+                d(b"bnVtRWxlbWVudHM="): n_outputs,
+                d(b"bWVzc2FnZVNpemU="): 4 * self.channel_packet_size,
+                d(b"c2xhY2s="): 16,  # size of buffer on chip (in packets)
+            },
         )
         d_get(self.nengo_io_h2c, b"Y29ubmVjdA==")(host_process, nengo_io)
         d_get(self.nengo_io_c2h, b"Y29ubmVjdA==")(nengo_io, host_process)
diff --git a/nengo_loihi/hardware/snips/nengo_host.cc.template b/nengo_loihi/hardware/snips/nengo_host.cc.template
index e89da447e..1c0c8ab49 100644
--- a/nengo_loihi/hardware/snips/nengo_host.cc.template
+++ b/nengo_loihi/hardware/snips/nengo_host.cc.template
@@ -13,14 +13,20 @@
 #include "nxsdkhost.h"
 
 #define N_OUTPUTS {{ n_outputs }}
-//#define BUFFER_SIZE {{ host_buffer_size }}
-#define BUFFER_SIZE 1024
+#define N_OUTPUT_PACKETS {{ n_output_packets }}
+#define BUFFER_SIZE {{ host_buffer_size }}
+#define PACKET_SIZE {{ packet_size }}
 
 #define SERVER_PORT htons({{ server_port }})
 
 
 namespace nengo_host {
 
+template <class T>
+inline T ceil_div(T a, T b) {
+    return (a / b) + (a % b != 0);
+}
+
 const char input_channel[] = "{{ input_channel }}";
 const char output_channel[] = "{{ output_channel }}";
 
@@ -91,6 +97,8 @@ class NengoHostProcess : public ConcurrentHostSnip {
             }
 
             // read messages from superhost socket
+            // we make sure BUFFER_SIZE is large enough for any message
+            // TODO: read in blocks of e.g 4096 bytes (small power of 2) for speed
             size_t n_read = read(client_socket, buffer, BUFFER_SIZE * sizeof(int32_t));
             if (n_read % sizeof(int32_t) != 0) {
                 std::cout << "Did not read full int32s from socket" << std::endl;
@@ -102,7 +110,8 @@ class NengoHostProcess : public ConcurrentHostSnip {
             }
 
             // write packets to chip (PACKET_SIZE == sizeof(int32_t))
-            writeChannel(input_channel, buffer, n_read / sizeof(int32_t));
+            size_t n_packets = ceil_div<size_t>(n_read, 4*PACKET_SIZE);
+            writeChannel(input_channel, buffer, n_packets);
 
             // wait until chip has written output
             while (!end_of_execution && !probeChannel(output_channel)) {
@@ -113,7 +122,7 @@ class NengoHostProcess : public ConcurrentHostSnip {
             }
 
             // read chip output
-            readChannel(output_channel, buffer, N_OUTPUTS);
+            readChannel(output_channel, buffer, N_OUTPUT_PACKETS);
 
             // write output to superhost socket
             const size_t write_len = N_OUTPUTS;
diff --git a/nengo_loihi/hardware/snips/nengo_io.c.template b/nengo_loihi/hardware/snips/nengo_io.c.template
index 8ad902e0a..99a3ee556 100644
--- a/nengo_loihi/hardware/snips/nengo_io.c.template
+++ b/nengo_loihi/hardware/snips/nengo_io.c.template
@@ -4,8 +4,10 @@
 
 #define DEBUG 0
 #define N_OUTPUTS {{ n_outputs }}
+#define N_OUTPUT_PACKETS {{ n_output_packets }}
 #define N_ERRORS {{ n_errors }}
-#define MAX_ERROR_LEN {{ max_error_len }}
+#define BUFFER_SIZE {{ buffer_size }}
+#define PACKET_SIZE {{ packet_size }}
 #define SPIKE_SIZE {{ obfs.spike_size }}
 #define ERROR_INFO_SIZE {{ obfs.error_info_size }}
 
@@ -20,6 +22,13 @@ int guard_io(runState *s) {
 }
 
 void nengo_io(runState *s) {
+#if SPIKE_SIZE != 2
+    // SPIKE_SIZE != 2 will require a number of changes to this function.
+    // Search for SPIKE_SIZE == 2 precompiler directives.
+    printf("SPIKE_SIZE == 2 assertion failed\n");
+    return;
+#endif
+
 {% for core in cores %}
     {{ obfs.core_class }} *core{{ core }} = NEURON_PTR((CoreId){ .id={{ core }} });
 {% endfor %}
@@ -30,12 +39,18 @@ void nengo_io(runState *s) {
     {{ obfs.int_type }} axon_type;
     {{ obfs.int_type }} axon_id;
     {{ obfs.int_type }} atom;
-    {{ obfs.int_type }} count[1];
-    {{ obfs.int_type }} spike[SPIKE_SIZE];
-    {{ obfs.int_type }} error_info[ERROR_INFO_SIZE];
-    {{ obfs.int_type }} error_data[MAX_ERROR_LEN];
-    {{ obfs.int_type }} error_index;
-    {{ obfs.int_type }} output[N_OUTPUTS];
+    {{ obfs.int_type }} n_spikes;  // input spike count
+    {{ obfs.int_type }} i_spike;  // input spike position
+    {{ obfs.int_type }} *spike;
+
+    {{ obfs.int_type }} error_index;  // index into error stored in shared data
+    {{ obfs.int_type }} i_error = 0;  // index of error block
+    {{ obfs.int_type }} j_error = 0;  // index of error in error block
+    {{ obfs.int_type }} n_errors = -1;  // number of errors in error block
+
+    {{ obfs.int_type }} buffer[BUFFER_SIZE];
+    {{ obfs.int_type }} buffer_pos;  // current read position in buffer
+    {{ obfs.int_type }} buffer_len;  // current length of info in buffer
 
     if (in_channel == -1 || out_channel == -1) {
         printf("Got an invalid channel ID\n");
@@ -46,29 +61,55 @@ void nengo_io(runState *s) {
         printf("time %d\n", s->{{ obfs.step }});
     }
 
-    {{ obfs.read }}(in_channel, count, 1);
-    if (count < 0) {
-        printf("Chip received shutdown signal: %d\n", count);
+    // read first packet
+    {{ obfs.read }}(in_channel, buffer, 1);
+    buffer_len = PACKET_SIZE;
+    n_spikes = buffer[0];
+    buffer_pos = 1;
+    if (n_spikes < 0) {
+        printf("Chip received shutdown signal: %d\n", n_spikes);
         is_shutdown = 1;
         return;
     }
-    if (DEBUG) {
-        printf("count %d\n", count[0]);
-    }
+#if DEBUG
+    printf("num input spikes: %d\n", n_spikes);
+#endif
 
-    for (int i=0; i < count[0]; i++) {
-        {{ obfs.read }}(in_channel, spike, SPIKE_SIZE);
-        if (DEBUG) {
-            printf("send spike %d.%d\n", spike[0], spike[1]);
+    for (i_spike = 0; i_spike < n_spikes; i_spike++) {
+        // read a new packet if necessary
+        if (buffer_pos + SPIKE_SIZE > buffer_len) {
+            if (buffer_len - buffer_pos > 0) {
+                // part of a spike remains at end of buffer
+#if SPIKE_SIZE == 2
+                buffer[0] = buffer[buffer_pos];
+                buffer_len = 1;
+#endif
+            } else {
+                buffer_len = 0;
+            }
+
+            // read next packet
+            {{ obfs.read }}(in_channel, &buffer[buffer_len], 1);
+            buffer_len += PACKET_SIZE;
+            buffer_pos = 0;
         }
+
+        spike = &buffer[buffer_pos];
+        buffer_pos += SPIKE_SIZE;
+#if DEBUG
+        printf("send spike %d.%d\n", spike[0], spike[1]);
+#endif
+
+#if SPIKE_SIZE == 2
         core_id = ({{ obfs.id_class }}) { .id=(spike[0] >> {{ obfs.spike_shift }}) };
         axon_id = spike[0] & {{ obfs.spike_mask }};
         axon_type = spike[1] >> {{ obfs.spike_shift }};
         atom = spike[1] & {{ obfs.spike_mask }};
-        if (DEBUG) {
-            printf("send spike core=%d, axon=%d, type=%d atom=%d\n",
-                   core_id.id, axon_id, axon_type, atom);
-        }
+#endif
+#if DEBUG
+        printf("send spike core=%d, axon=%d, type=%d atom=%d\n",
+               core_id.id, axon_id, axon_type, atom);
+#endif
         if (axon_type == {{ obfs.axon_type_0 }}) {
             {{ obfs.do_axon_type_0 }}(s->{{ obfs.step }}, core_id, axon_id);
         } else if (axon_type == {{ obfs.axon_type_1 }}) {
@@ -82,25 +123,60 @@ void nengo_io(runState *s) {
     // Communicate with learning snip
     s->{{ obfs.data }}[0] = N_ERRORS;
     error_index = 1;
-    for (int i=0; i < N_ERRORS; i++) {
-        {{ obfs.read }}(in_channel, error_info, ERROR_INFO_SIZE);
-        {{ obfs.read }}(in_channel, error_data, error_info[1]);
-        s->{{ obfs.data }}[error_index] = error_info[0];
-        s->{{ obfs.data }}[error_index + 1] = error_info[1];
-        for (int j=0; j < error_info[1]; j++) {
-            s->{{ obfs.data }}[error_index + ERROR_INFO_SIZE + j] = error_data[j];
+    i_error = 0;
+    j_error = 0;
+    n_errors = -1;
+
+    while (i_error < N_ERRORS || j_error < n_errors) {
+        // read from channel
+        if (buffer_pos + ((n_errors < 0) ? ERROR_INFO_SIZE : 1) > buffer_len) {
+            if (buffer_pos < buffer_len) {
+                // part of the error info remains at end of buffer
+#if ERROR_INFO_SIZE == 2
+                buffer[0] = buffer[buffer_pos];
+                buffer_len = 1;
+#endif
+            } else {
+                buffer_len = 0;
+            }
+
+            // read next packet
+            {{ obfs.read }}(in_channel, &buffer[buffer_len], 1);
+            buffer_len += PACKET_SIZE;
+            buffer_pos = 0;
+        }
+
+        if (n_errors < 0) {
+            // move to next error block
+{% for j in range(obfs.error_info_size) %}
+            s->{{ obfs.data }}[error_index + {{ j }}] = buffer[buffer_pos + {{ j }}];
+{% endfor %}
+            n_errors = buffer[buffer_pos + 1];
+            j_error = 0;
+            error_index += ERROR_INFO_SIZE;
+            buffer_pos += ERROR_INFO_SIZE;
+        } else {
+            // read next error
+            s->{{ obfs.data }}[error_index] = buffer[buffer_pos];
+            j_error++;
+            error_index++;
+            buffer_pos++;
+        }
+
+        if (j_error == n_errors) {
+            i_error++;
+            n_errors = -1;
         }
-        error_index += ERROR_INFO_SIZE + error_info[1];
     }
 
-    output[0] = s->{{ obfs.step }};
+    buffer[0] = s->{{ obfs.step }};
 {% for n_out, core, compartment, key in probes %}
 {% if key == 'u' %}
-    output[{{ n_out }}] = core{{ core }}->{{ obfs.state }}[{{ compartment }}].U;
+    buffer[{{ n_out }}] = core{{ core }}->{{ obfs.state }}[{{ compartment }}].U;
 {% elif key in ('v', 'spike') %}
-    output[{{ n_out }}] = core{{ core }}->{{ obfs.state }}[{{ compartment }}].V;
+    buffer[{{ n_out }}] = core{{ core }}->{{ obfs.state }}[{{ compartment }}].V;
 {% endif %}
 {% endfor %}
 
-    {{ obfs.write }}(out_channel, output, N_OUTPUTS);
+    {{ obfs.write }}(out_channel, buffer, N_OUTPUT_PACKETS);
 }

From af847d3e0ece984a45270e8965c6db670c5dceb5 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Wed, 30 Oct 2019 07:03:13 -0700
Subject: [PATCH 10/24] Assert one block per core with learning

---
 nengo_loihi/hardware/interface.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index da0970125..30ed20d93 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -309,7 +309,9 @@ def host2chip(self, spikes, errors):
             for core in self.board.chips[0].cores:
                 for block in core.blocks:
                     if synapse in block.synapses:
-                        # TODO: assumes one block per core
+                        assert (
+                            len(core.blocks) == 1
+                        ), "Learning not implemented with multiple blocks per core"
                         coreid = core.learning_coreid
                         break
 

From 5af088893d30579505b44244b446dece0e473426 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Wed, 23 Oct 2019 12:24:10 -0400
Subject: [PATCH 11/24] TMP: Added test_many_ensembles.py for speed test

---
 test_many_ensembles.py | 53 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 test_many_ensembles.py

diff --git a/test_many_ensembles.py b/test_many_ensembles.py
new file mode 100644
index 000000000..d441a8def
--- /dev/null
+++ b/test_many_ensembles.py
@@ -0,0 +1,53 @@
+import matplotlib.pyplot as plt
+import nengo
+import nengo_loihi
+import numpy as np
+
+
+def input_f(t, phase_offset):
+    return np.sin(6 * t + phase_offset + np.array([0, np.pi / 2]))
+
+
+n_ensembles = 15
+n_neurons = 600
+phase_step = 1  # in radians
+dimensions = len(input_f(0, 0))
+
+
+with nengo.Network(seed=34) as net:
+    nengo_loihi.set_defaults()
+
+    inputs = []
+    ensembles = []
+    probes = []
+    for i in range(n_ensembles):
+
+        def input_f_i(t, i=i):
+            return input_f(t, phase_offset=i * phase_step)
+
+        input = nengo.Node(input_f_i)
+        ensemble = nengo.Ensemble(n_neurons, dimensions=dimensions)
+        probe = nengo.Probe(ensemble, synapse=nengo.synapses.Alpha(0.01))
+        nengo.Connection(input, ensemble, synapse=None)
+
+        inputs.append(input)
+        ensembles.append(ensemble)
+        probes.append(probe)
+
+
+with nengo_loihi.Simulator(net, precompute=False) as sim:
+    sim.run(1.0)
+    #sim.run(10.0)
+    print("Time/step: %0.2f ms" % (sim.timers["snips"] / sim.n_steps * 1000))
+
+plt.figure()
+
+rows = 2
+cols = 2
+
+for i in range(rows * cols):
+    plt.subplot(rows, cols, i + 1)
+    plt.plot(sim.trange(), sim.data[probes[i]])
+
+plt.savefig("test_many_ensembles.png")
+plt.show()

From df56f68718cf18c7b9d2df27c1a9450bd687df44 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <erichuns@gmail.com>
Date: Wed, 30 Oct 2019 15:04:43 -0400
Subject: [PATCH 12/24] fixup! Add host SNIP for faster communication

---
 nengo_loihi/hardware/tests/test_interface.py | 47 --------------------
 nengo_loihi/tests/test_simulator.py          |  1 -
 2 files changed, 48 deletions(-)

diff --git a/nengo_loihi/hardware/tests/test_interface.py b/nengo_loihi/hardware/tests/test_interface.py
index 20355eff4..2228e2ce0 100644
--- a/nengo_loihi/hardware/tests/test_interface.py
+++ b/nengo_loihi/hardware/tests/test_interface.py
@@ -131,50 +131,3 @@ def test_snip_input_count(Simulator, seed, plt):
     with Simulator(model) as sim:
         with pytest.warns(UserWarning, match="Too many spikes"):
             sim.run(0.01)
-
-
-@pytest.mark.target_loihi
-def test_no_hang_on_exception(Simulator, seed):
-    def input_fn(t):
-        if 0.03 < t < 0.05:
-            raise RuntimeError("test_no_hang")
-
-        return t
-
-    with nengo.Network(seed=seed) as net:
-        u = nengo.Node(input_fn)
-        a = nengo.Ensemble(100, 1)
-        nengo.Connection(u, a)
-        nengo.Probe(a)
-
-    with pytest.raises(RuntimeError, match="test_no_hang"):
-        with nengo_loihi.Simulator(net, precompute=False) as sim:
-            sim.run(0.1)
-
-
-@pytest.mark.target_loihi
-def test_multiple_run_calls(Simulator, seed, plt):
-    """Test that we can re-start the SNIPs for a second `run` call"""
-    # NOTE: probe synapses get applied at the end of `run`, so we keep probe
-    # synapses off and do our own filtering.
-
-    f = lambda t: np.sin(8 * np.pi * t)
-
-    with nengo.Network(seed=seed) as net:
-        nengo_loihi.set_defaults()
-
-        u = nengo.Node(f)
-        a = nengo.Ensemble(100, 1)
-        nengo.Connection(u, a, synapse=None)
-        ap = nengo.Probe(a)
-
-    with nengo_loihi.Simulator(net, precompute=False) as sim:
-        sim.run(0.1)
-        sim.run(0.15)
-
-    t = sim.trange()
-    x = f(sim.trange())
-    y = nengo.synapses.Alpha(0.01).filt(sim.data[ap], dt=sim.dt)
-    plt.plot(t, x)
-    plt.plot(t, y)
-    assert signals_allclose(t, x, y, atol=0.07, rtol=0.07, delay=0.024, plt=plt)
diff --git a/nengo_loihi/tests/test_simulator.py b/nengo_loihi/tests/test_simulator.py
index 45a0b43b2..6a90dbb04 100644
--- a/nengo_loihi/tests/test_simulator.py
+++ b/nengo_loihi/tests/test_simulator.py
@@ -452,7 +452,6 @@ def test_interface(Simulator, allclose):
             sim.run(1e-8)
 
 
-@pytest.mark.hang
 @pytest.mark.target_loihi
 def test_loihi_simulation_exception(Simulator):
     """Test that Loihi shuts down properly after exception during simulation"""

From f788346280f6c29450325adecdfd73eebdd078ee Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Mon, 4 Nov 2019 12:30:23 -0800
Subject: [PATCH 13/24] Get core less in learn SNIP

---
 nengo_loihi/hardware/snips/nengo_learn.c.template | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nengo_loihi/hardware/snips/nengo_learn.c.template b/nengo_loihi/hardware/snips/nengo_learn.c.template
index d2038d665..c5db54ce9 100644
--- a/nengo_loihi/hardware/snips/nengo_learn.c.template
+++ b/nengo_loihi/hardware/snips/nengo_learn.c.template
@@ -38,9 +38,11 @@ void nengo_learn(runState *s) {
     for (int error_index=0; error_index < n_errors; error_index++) {
         core = s->{{ obfs.data }} [offset];
         n_vals = s->{{ obfs.data }} [offset+1];
+
+        neuron = {{ obfs.neuron }}(({{ obfs.id_class }}) {.id=core});
+
         for (int i=0; i < n_vals; i++) {
             error = (signed char) s->{{ obfs.data }} [offset+2+i];
-            neuron = {{ obfs.neuron }}(({{ obfs.id_class }}) {.id=core});
             compartment_idx = i;
 
             if (error > 0) {

From 9210d6ae11ceac937ce57718e5eedc0d80e117a5 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Mon, 4 Nov 2019 12:31:12 -0800
Subject: [PATCH 14/24] Reduce extra computation in host2chip_snips

---
 nengo_loihi/hardware/interface.py | 43 +++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index 30ed20d93..1b20e1970 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -77,6 +77,7 @@ def __init__(
         self.host_socket = None  # IO snip superhost (this) <-> host socket
         self.host_socket_connected = False
         self.host_socket_port = None
+        self.error_chip_map = {}  # maps synapses to chip locations for errors
         self._probe_filters = {}
         self._probe_filter_pos = {}
         self._snip_probe_data = collections.OrderedDict()
@@ -303,24 +304,38 @@ def host2chip(self, spikes, errors):
             spike_input = self.nxsdk_board.spike_inputs[spike_input]
             loihi_spikes.extend(spike_input.spikes_to_loihi(t, s))
 
-        loihi_errors = []
+        error_info = []
+        error_vecs = []
         for synapse, t, e in errors:
-            coreid = None
-            for core in self.board.chips[0].cores:
-                for block in core.blocks:
-                    if synapse in block.synapses:
-                        assert (
-                            len(core.blocks) == 1
-                        ), "Learning not implemented with multiple blocks per core"
-                        coreid = core.learning_coreid
+            core_id = self.error_chip_map.get(synapse, None)
+            if core_id is None:
+                for core in self.board.chips[0].cores:
+                    for block in core.blocks:
+                        if synapse in block.synapses:
+                            assert (
+                                len(core.blocks) == 1
+                            ), "Learning not implemented with multiple blocks per core"
+                            core_id = core.learning_coreid
+                            break
+
+                    if core_id is not None:
                         break
 
-                if coreid is not None:
-                    break
+                assert core_id is not None
+                self.error_chip_map[synapse] = core_id
 
-            assert coreid is not None
-            e = scale_pes_errors(e, scale=self.pes_error_scale)
-            loihi_errors.append([coreid, len(e)] + e.tolist())
+            error_info.append([core_id, len(e)])
+            error_vecs.append(e)
+
+        loihi_errors = []
+        if len(error_vecs) > 0:
+            error_vecs = np.concatenate(error_vecs)
+            error_vecs = scale_pes_errors(error_vecs, scale=self.pes_error_scale)
+
+            i = 0
+            for core_id, e_len in error_info:
+                loihi_errors.append([core_id, e_len] + error_vecs[i:i + e_len].tolist())
+                i += e_len
 
         if self.use_snips:
             return self._host2chip_snips(loihi_spikes, loihi_errors)

From 5f430785b22da800fc87f4056aa62735e589b1fb Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Mon, 4 Nov 2019 12:32:19 -0800
Subject: [PATCH 15/24] Only sleep when creating socket connection

---
 nengo_loihi/hardware/interface.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nengo_loihi/hardware/interface.py b/nengo_loihi/hardware/interface.py
index 1b20e1970..b1ef3992b 100644
--- a/nengo_loihi/hardware/interface.py
+++ b/nengo_loihi/hardware/interface.py
@@ -163,11 +163,11 @@ def run_steps(self, steps, blocking=True):
         self.connect()  # returns immediately if already connected
         d_get(self.nxsdk_board, b"cnVu")(steps, **{d(b"YVN5bmM="): not blocking})
 
-        # pause to allow host snip to start and listen for connection
-        time.sleep(0.1)
-
         # connect to host socket
         if self.host_socket is not None and not self.host_socket_connected:
+            # pause to allow host snip to start and listen for connection
+            time.sleep(0.1)
+
             host_address = self.nxsdk_board.executor._host_coordinator.hostAddr
             print(
                 "Connecting to host socket at (%s, %s)"

From af494a6a7bfe0b2bcb52e8a76bb09b5e7b6412a4 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Mon, 4 Nov 2019 12:36:56 -0800
Subject: [PATCH 16/24] Simpler queueing in HostReceiveNode

---
 nengo_loihi/builder/inputs.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/nengo_loihi/builder/inputs.py b/nengo_loihi/builder/inputs.py
index 54a7997fa..a9903d087 100644
--- a/nengo_loihi/builder/inputs.py
+++ b/nengo_loihi/builder/inputs.py
@@ -25,18 +25,14 @@ class HostReceiveNode(Node):
 
     def __init__(self, dimensions, label=Default):
         self.queue = [(0, np.zeros(dimensions))]
-        self.queue_index = 0
         super(HostReceiveNode, self).__init__(
             self.update, size_in=0, size_out=dimensions, label=label
         )
 
     def update(self, t):
-        while (
-            len(self.queue) > self.queue_index + 1
-            and self.queue[self.queue_index][0] < t
-        ):
-            self.queue_index += 1
-        return self.queue[self.queue_index][1]
+        t1, x = self.queue[-1]
+        assert t >= t1
+        return x
 
     def receive(self, t, x):
         self.queue.append((t, x))

From 922c27b8f79cff3cef93024d6052d395f416601a Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Fri, 25 Oct 2019 08:30:32 -0700
Subject: [PATCH 17/24] Flip host2chip and host step order

This allows the Nengo model on the (super)host to be running
simultaneously with the chip, reducing time per step but adding
in a one step delay between the (super)host model and chip model.
---
 nengo_loihi/simulator.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/nengo_loihi/simulator.py b/nengo_loihi/simulator.py
index 2483450d6..467747de7 100644
--- a/nengo_loihi/simulator.py
+++ b/nengo_loihi/simulator.py
@@ -516,10 +516,16 @@ def loihi_precomputed_host_only(steps):
             def loihi_bidirectional_with_host(steps):
                 loihi.run_steps(steps, blocking=False)
                 time0 = timeit.default_timer()
-                for _ in range(steps):
-                    host.step()
+
+                # Run the first host step so there is info to send to the chip,
+                # then run subsequent host steps simultaneously with the chip
+                host.step()
+                for i in range(steps):
                     self._host2chip(loihi)
+                    if i + 1 < steps:
+                        host.step()
                     self._chip2host(loihi)
+
                 time1 = timeit.default_timer()
                 self.timers["snips"] += time1 - time0
 

From 254944eb4f60d3ce11a598e2dcad71cb27b46ea6 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Mon, 4 Nov 2019 13:53:50 -0800
Subject: [PATCH 18/24] Turn off output check on helper nodes if possible

---
 nengo_loihi/builder/inputs.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/nengo_loihi/builder/inputs.py b/nengo_loihi/builder/inputs.py
index a9903d087..7a2e1b6c4 100644
--- a/nengo_loihi/builder/inputs.py
+++ b/nengo_loihi/builder/inputs.py
@@ -9,11 +9,13 @@
 class HostSendNode(Node):
     """For sending host->chip messages"""
 
-    def __init__(self, dimensions, label=Default):
+    def __init__(self, dimensions, label=Default, check_output=False):
         self.queue = []
         super(HostSendNode, self).__init__(
             self.update, size_in=dimensions, size_out=0, label=label
         )
+        if hasattr(self, "check_output"):
+            self.check_output = check_output
 
     def update(self, t, x):
         assert len(self.queue) == 0 or t > self.queue[-1][0]
@@ -23,11 +25,13 @@ def update(self, t, x):
 class HostReceiveNode(Node):
     """For receiving chip->host messages"""
 
-    def __init__(self, dimensions, label=Default):
+    def __init__(self, dimensions, label=Default, check_output=False):
         self.queue = [(0, np.zeros(dimensions))]
         super(HostReceiveNode, self).__init__(
             self.update, size_in=0, size_out=dimensions, label=label
         )
+        if hasattr(self, "check_output"):
+            self.check_output = check_output
 
     def update(self, t):
         t1, x = self.queue[-1]

From cda7a5f4d670dfed638af3e379f350407e91c7a0 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Mon, 4 Nov 2019 13:56:37 -0800
Subject: [PATCH 19/24] Eliminate slow hasattr call in collect receivers

---
 nengo_loihi/builder/inputs.py |  6 ++++++
 nengo_loihi/simulator.py      | 29 ++++++++++++++---------------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/nengo_loihi/builder/inputs.py b/nengo_loihi/builder/inputs.py
index 7a2e1b6c4..fa41a19bf 100644
--- a/nengo_loihi/builder/inputs.py
+++ b/nengo_loihi/builder/inputs.py
@@ -64,6 +64,9 @@ def receive(self, t, x):
     def update(self, t):
         raise SimulationError("ChipReceiveNodes should not be run")
 
+    def collect_errors(self):
+        return ()
+
     def collect_spikes(self):
         assert self.spike_input is not None
         for t, x in self.spikes:
@@ -96,3 +99,6 @@ def receive(self, t, x):
     def collect_errors(self):
         for t, x in self.errors.items():
             yield (self.target, t, x)
+
+    def collect_spikes(self):
+        return ()
diff --git a/nengo_loihi/simulator.py b/nengo_loihi/simulator.py
index 467747de7..20dea9dbf 100644
--- a/nengo_loihi/simulator.py
+++ b/nengo_loihi/simulator.py
@@ -379,21 +379,20 @@ def _collect_receiver_info(self):
                 receiver.receive(t, x)
             del sender.queue[:]
 
-            if hasattr(receiver, "collect_spikes"):
-                for spike_input, t, spike_idxs in receiver.collect_spikes():
-                    ti = round(t / self.model.dt)
-                    spikes.append((spike_input, ti, spike_idxs))
-            if hasattr(receiver, "collect_errors"):
-                for probe, t, e in receiver.collect_errors():
-                    conn = self.model.probe_conns[probe]
-                    synapse = self.model.objs[conn]["decoders"]
-                    assert synapse.learning
-                    ti = round(t / self.model.dt)
-                    errors_ti = errors.setdefault(ti, OrderedDict())
-                    if synapse in errors_ti:
-                        errors_ti[synapse] += e
-                    else:
-                        errors_ti[synapse] = e.copy()
+            for spike_input, t, spike_idxs in receiver.collect_spikes():
+                ti = round(t / self.model.dt)
+                spikes.append((spike_input, ti, spike_idxs))
+
+            for probe, t, e in receiver.collect_errors():
+                conn = self.model.probe_conns[probe]
+                synapse = self.model.objs[conn]["decoders"]
+                assert synapse.learning
+                ti = round(t / self.model.dt)
+                errors_ti = errors.setdefault(ti, OrderedDict())
+                if synapse in errors_ti:
+                    errors_ti[synapse] += e
+                else:
+                    errors_ti[synapse] = e.copy()
 
         errors = [
             (synapse, ti, e) for ti, ee in errors.items() for synapse, e in ee.items()

From 67c4927fda7b51bbd9223e954a76465c95e2cd88 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Mon, 4 Nov 2019 15:05:27 -0800
Subject: [PATCH 20/24] WIP: Add simplified collect_receiver_info for one step

We're typically calling it one step at a time, in which case we
can simplify it and make it faster.

TODO:
- Fix up original collect_receiver_info and remove profiling code
- The original can probably do the same thing were we get rid of
  the receive and collect functions on the nodes, since essentially
  these just populate and read lists in those nodes.
- Only use the single step function if we know we'll be running
  one step at a time (this might be always right now).
---
 nengo_loihi/builder/inputs.py | 11 +++--
 nengo_loihi/simulator.py      | 81 ++++++++++++++++++++++++++++++++---
 2 files changed, 79 insertions(+), 13 deletions(-)

diff --git a/nengo_loihi/builder/inputs.py b/nengo_loihi/builder/inputs.py
index fa41a19bf..95c2fb714 100644
--- a/nengo_loihi/builder/inputs.py
+++ b/nengo_loihi/builder/inputs.py
@@ -48,6 +48,7 @@ class ChipReceiveNode(Node):
     def __init__(self, dimensions, size_out, label=Default):
         self.raw_dimensions = dimensions
         self.spikes = []
+        self.error_target = None
         self.spike_input = None  # set by builder
         super(ChipReceiveNode, self).__init__(
             self.update, size_in=0, size_out=size_out, label=label
@@ -68,9 +69,7 @@ def collect_errors(self):
         return ()
 
     def collect_spikes(self):
-        assert self.spike_input is not None
-        for t, x in self.spikes:
-            yield (self.spike_input, t, x)
+        return self.spikes
 
 
 class ChipReceiveNeurons(ChipReceiveNode):
@@ -83,8 +82,9 @@ def __init__(self, dimensions, neuron_type=None, label=Default):
 
 class PESModulatoryTarget:
     def __init__(self, target):
-        self.target = target
         self.errors = OrderedDict()
+        self.error_target = target
+        self.spike_input = None
 
     def clear(self):
         self.errors.clear()
@@ -97,8 +97,7 @@ def receive(self, t, x):
             self.errors[t] = np.array(x)
 
     def collect_errors(self):
-        for t, x in self.errors.items():
-            yield (self.target, t, x)
+        return self.errors.items()
 
     def collect_spikes(self):
         return ()
diff --git a/nengo_loihi/simulator.py b/nengo_loihi/simulator.py
index 20dea9dbf..cfc9150fd 100644
--- a/nengo_loihi/simulator.py
+++ b/nengo_loihi/simulator.py
@@ -371,36 +371,103 @@ def step(self):
         self.run_steps(1)
 
     def _collect_receiver_info(self):
+        if not hasattr(self, "_error_synapse_map"):
+            self._error_synapse_map = {}
+            for _, receiver in self.model.host2chip_senders.items():
+                probe = receiver.error_target
+                if probe is None:
+                    continue
+
+                conn = self.model.probe_conns[probe]
+                synapse = self.model.objs[conn]["decoders"]
+                assert synapse.learning
+                if probe in self._error_synapse_map:
+                    assert self._error_synapse_map[probe] is synapse
+                else:
+                    self._error_synapse_map[probe] = synapse
+
+            self.timers["collect_recv"] = 0
+            self.timers["collect_spike"] = 0
+            self.timers["collect_error"] = 0
+            self.timers["collect_end"] = 0
+
         spikes = []
         errors = OrderedDict()
         for sender, receiver in self.model.host2chip_senders.items():
+            timer = timeit.default_timer()
             receiver.clear()
             for t, x in sender.queue:
                 receiver.receive(t, x)
-            del sender.queue[:]
+            sender.queue.clear()
+            self.timers["collect_recv"] += timeit.default_timer() - timer
 
-            for spike_input, t, spike_idxs in receiver.collect_spikes():
+            timer = timeit.default_timer()
+            spike_input = receiver.spike_input
+            for t, spike_idxs in receiver.collect_spikes():
                 ti = round(t / self.model.dt)
                 spikes.append((spike_input, ti, spike_idxs))
+            self.timers["collect_spike"] += timeit.default_timer() - timer
 
-            for probe, t, e in receiver.collect_errors():
-                conn = self.model.probe_conns[probe]
-                synapse = self.model.objs[conn]["decoders"]
-                assert synapse.learning
+            timer = timeit.default_timer()
+            probe = receiver.error_target
+            synapse = self._error_synapse_map.get(probe, None)
+            for t, e in receiver.collect_errors():
                 ti = round(t / self.model.dt)
                 errors_ti = errors.setdefault(ti, OrderedDict())
                 if synapse in errors_ti:
                     errors_ti[synapse] += e
                 else:
                     errors_ti[synapse] = e.copy()
+            self.timers["collect_error"] += timeit.default_timer() - timer
 
+        timer = timeit.default_timer()
         errors = [
             (synapse, ti, e) for ti, ee in errors.items() for synapse, e in ee.items()
         ]
+        self.timers["collect_end"] += timeit.default_timer() - timer
+        return spikes, errors
+
+    def _collect_receiver_info1(self):
+        spikes = []
+        errors = OrderedDict()
+        tf = None
+        ti = None
+        for sender, receiver in self.model.host2chip_senders.items():
+            assert receiver.spike_input is None or receiver.error_target is None
+            assert len(sender.queue) == 1
+            t, x = sender.queue.pop()
+            if tf is None:
+                tf = t
+                ti = round(t / self.model.dt)
+            else:
+                assert t == tf
+
+            spike_target = receiver.spike_input
+            if spike_target is not None:
+                spike_idxs = x.nonzero()[0]
+                spikes.append((spike_target, ti, spike_idxs))
+
+            error_target = receiver.error_target
+            if error_target is not None:
+                conn = self.model.probe_conns[error_target]
+                synapse = self.model.objs[conn]["decoders"]
+                assert synapse.learning
+
+                if synapse in errors:
+                    errors[synapse] += x
+                else:
+                    errors[synapse] = x.copy()
+
+        if len(errors) > 0:
+            assert ti is not None
+            errors = [(synapse, ti, e) for synapse, e in errors.items()]
+        else:
+            errors = []
+
         return spikes, errors
 
     def _host2chip(self, sim):
-        spikes, errors = self._collect_receiver_info()
+        spikes, errors = self._collect_receiver_info1()
         sim.host2chip(spikes, errors)
 
     def _chip2host(self, sim):

From 5e751162ba2eb60cf14068db06389fdce5075d8e Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Tue, 5 Nov 2019 11:41:40 -0800
Subject: [PATCH 21/24] DecodeNeurons.get_ensemble user-friendly

Previously, fixed checking of `neurons_per_dimension` and fixed
value for `add_to_container` made `get_ensemble` not particularly
useful for users trying to make their own `DecodeNeurons`. Now,
these are configurable, and default to the values that users would
likely want.
---
 nengo_loihi/builder/connection.py | 2 +-
 nengo_loihi/decode_neurons.py     | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/nengo_loihi/builder/connection.py b/nengo_loihi/builder/connection.py
index 14d7630cb..2719402dd 100644
--- a/nengo_loihi/builder/connection.py
+++ b/nengo_loihi/builder/connection.py
@@ -185,7 +185,7 @@ def build_host_to_chip(model, conn):
     build_chip_connection(model, receive2post)
 
     logger.debug("Creating DecodeNeuron ensemble for %s", conn)
-    ens = model.node_neurons.get_ensemble(dim)
+    ens = model.node_neurons.get_ensemble(dim, is_input=True, add_to_container=False)
     ens.label = None if conn.label is None else "%s_ens" % conn.label
     _inherit_seed(host, ens, model, conn)
     host.build(ens)
diff --git a/nengo_loihi/decode_neurons.py b/nengo_loihi/decode_neurons.py
index f57c02169..17f2e1a5c 100644
--- a/nengo_loihi/decode_neurons.py
+++ b/nengo_loihi/decode_neurons.py
@@ -167,8 +167,8 @@ def get_block(self, weights, block_label=None, syn_label=None):
 
         return block, syn
 
-    def get_ensemble(self, dim):
-        if self.pairs_per_dim != 1:
+    def get_ensemble(self, dim, is_input=False, **ens_kwargs):
+        if is_input and self.pairs_per_dim != 1:
             # To support this, we need to figure out how to deal with the
             # `post_inds` that map neurons to axons. Either we can do this
             # on the host, in which case we'd have inputs going to the chip
@@ -187,7 +187,7 @@ def get_ensemble(self, dim):
             encoders=encoders,
             gain=self.gain.repeat(dim),
             bias=self.bias.repeat(dim),
-            add_to_container=False,
+            **ens_kwargs
         )
         return ens
 

From 677378403eb906d3c7f58468b2678646d1467a94 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Wed, 6 Nov 2019 07:01:01 -0800
Subject: [PATCH 22/24] WIP: Add parallel_ensembles benchmark

This benchmark has a number of ensembles in parallel on the chip.
It can be used to test chip input and output speeds, and learning.

TODO:
- Uses weird intercepts. Just use normal ones.
- Currently only set up for learning. Allow it to be used without
  learning, and without funnelling on inputs and outputs.
---
 docs/benchmarks/parallel_ensembles.py | 179 ++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 docs/benchmarks/parallel_ensembles.py

diff --git a/docs/benchmarks/parallel_ensembles.py b/docs/benchmarks/parallel_ensembles.py
new file mode 100644
index 000000000..51470bfaa
--- /dev/null
+++ b/docs/benchmarks/parallel_ensembles.py
@@ -0,0 +1,179 @@
+import nengo
+import nengo_loihi
+from nengo_loihi import decode_neurons
+import numpy as np
+import timeit
+import traceback
+
+from utils import AreaIntercepts, Triangular
+
+rng = np.random.RandomState(1)
+
+#n_input = 6
+#n_output = 3
+n_input = 10
+n_output = 5
+
+#n_neurons = 10
+n_neurons = 400
+#n_neurons = 1000
+#n_ensembles = 1
+n_ensembles = 20
+#pes_learning_rate = 5e-9
+pes_learning_rate = 5e-6
+#pes_learning_rate = 5e-4
+seed = 0
+
+# synapse time constants
+tau_input = 0.01  # on input connection
+tau_training = 0.01  # on the training signal
+#tau_output = 0.2  # on the output from the adaptive ensemble
+tau_output = 0.01  # on the output from the adaptive ensemble
+# NOTE: the time constant on the neural activity used in the learning
+# connection is the default 0.005, and can be set by specifying the
+# pre_synapse parameter inside the PES rule instantiation
+
+# set up neuron intercepts
+intercepts_bounds = [-0.3, 0.1]
+intercepts_mode = 0.1
+
+intercepts_dist = AreaIntercepts(
+    dimensions=n_input,
+    base=Triangular(intercepts_bounds[0], intercepts_mode, intercepts_bounds[1]),
+)
+intercepts = intercepts_dist.sample(n=n_neurons * n_ensembles, rng=rng)
+intercepts = intercepts.reshape(n_ensembles, n_neurons)
+
+np.random.seed = seed
+encoders_dist = nengo.dists.UniformHypersphere(surface=True)
+encoders = encoders_dist.sample(n_neurons * n_ensembles, n_input, rng=rng)
+encoders = encoders.reshape(n_ensembles, n_neurons, n_input)
+
+nengo_model = nengo.Network(seed=seed)
+nengo_model.config[nengo.Ensemble].neuron_type = nengo.LIF()
+# nengo_model.config[nengo.Connection].synapse = None
+
+with nengo_model:
+    error_rng = np.random.RandomState(5)
+
+    assert n_input % n_output == 0
+    inout_transform = np.repeat(np.eye(n_output), n_input / n_output, axis=1)
+    inout_transform /= inout_transform.sum(axis=1, keepdims=True)
+
+    #inout_transform = error_rng.uniform(-1, 1, size=(n_output, n_input))
+    #inout_transform /= np.sqrt((inout_transform**2).sum(axis=1, keepdims=True))
+
+    def arm_func(t, u_adapt):
+        freq = np.pi
+        phases = (1 / n_input) * np.arange(n_input)
+        inputs = np.sin(freq*t + 2*np.pi*phases)
+
+        target_output = np.dot(inout_transform, inputs)
+        errors = u_adapt - target_output
+
+        if int(1000 * t) % 1000 == 0:
+            print(("i", inputs))
+            print(("u", u_adapt))
+            print(("o", target_output))
+            print(("e", errors))
+
+        return inputs.tolist() + errors.tolist()
+
+    arm = nengo.Node(arm_func, size_in=n_output, size_out=n_input+n_output, label="arm")
+    arm_probe = nengo.Probe(arm)
+
+    input_decodeneurons = decode_neurons.Preset10DecodeNeurons()
+    onchip_input = input_decodeneurons.get_ensemble(dim=n_input)
+    nengo.Connection(arm[:n_input], onchip_input, synapse=None)
+    inp2ens_transform = np.hstack(
+        [np.eye(n_input), -np.eye(n_input)] * input_decodeneurons.pairs_per_dim
+    )
+
+    output_decodeneurons = decode_neurons.Preset10DecodeNeurons()
+    onchip_output = output_decodeneurons.get_ensemble(dim=n_output)
+    out2arm_transform = np.hstack(
+        [np.eye(n_output), -np.eye(n_output)] * output_decodeneurons.pairs_per_dim
+    ) / 2000.
+    nengo.Connection(onchip_output.neurons, arm, transform=out2arm_transform, synapse=tau_output)
+
+    adapt_ens = []
+    conn_learn = []
+    for ii in range(n_ensembles):
+        adapt_ens.append(
+            nengo.Ensemble(
+                n_neurons=n_neurons,
+                dimensions=n_input,
+                intercepts=intercepts[ii],
+                radius=np.sqrt(n_input),
+                encoders=encoders[ii],
+                label="ens%02d" % ii,
+            )
+        )
+
+        # hook up input signal to adaptive population to provide context
+        inp2ens_transform_ii = np.dot(encoders[ii], inp2ens_transform)
+        nengo.Connection(onchip_input.neurons, adapt_ens[ii].neurons,
+                         transform=inp2ens_transform_ii, synapse=tau_input)
+
+        conn_learn.append(
+            nengo.Connection(
+                adapt_ens[ii],
+                onchip_output,
+                learning_rule_type=nengo.PES(
+                    pes_learning_rate, pre_synapse=tau_training,
+                ),
+                #transform=inout_transform / n_ensembles,
+                #transform=inout_transform / n_ensembles * 0.1,
+                transform=rng.uniform(-0.01, 0.01, size=(n_output, n_input)),
+                #transform=np.zeros((n_output, n_input)),
+                #transform=np.zeros((n_output, n_neurons)),
+                #transform=np.zeros((onchip_output.n_neurons, n_neurons)),
+            )
+        )
+
+        # hook up the training signal to the learning rule
+        nengo.Connection(
+            arm[n_input:], conn_learn[ii].learning_rule,
+            synapse=None,
+            #synapse=tau_training
+        )
+
+import cProfile
+
+with nengo_loihi.Simulator(
+        nengo_model,
+        target='loihi',
+        hardware_options=dict(snip_max_spikes_per_step=300)) as sim:
+# with nengo.Simulator(nengo_model) as sim:
+
+    hw = sim.sims["loihi"]
+    print("n_errors: %s" % hw.nengo_io_h2c_errors)
+    print("n_outputs: %s" % hw.nengo_io_c2h_count)
+
+    sim.run(0.001)
+    start = timeit.default_timer()
+    #sim.run(0.005)
+    #sim.run(1)
+    #sim.run(4)
+    sim.run(10)
+    #sim.run(30)
+    #cProfile.runctx("sim.run(1)", globals(), locals(), sort="cumtime")
+    #cProfile.runctx("sim.run(0.001)", globals(), locals(), sort="cumtime")
+    print('Run time: %0.5f' % (timeit.default_timer() - start))
+    print('Run time/step: %0.2f ms' % (1000 * sim.timers["snips"] / sim.n_steps))
+
+
+    inputs = sim.data[arm_probe][:, :n_input]
+    targets = np.dot(inputs, inout_transform.T)
+    errors = sim.data[arm_probe][:, n_input:]
+    outputs = errors + targets
+
+    error_steps = min(sim.n_steps, 2000)
+    start_error = np.abs(errors[:error_steps]).mean()
+    end_error = np.abs(errors[-error_steps:]).mean()
+    print("Error: start %0.3f, end %0.3f" % (start_error, end_error))
+
+import matplotlib.pyplot as plt
+plt.plot(sim.trange(), targets, ":")
+plt.plot(sim.trange(), outputs)
+plt.savefig("dynamics_adaptation_test.pdf")

From b2deba76a8865fc08f55614b5695ea214a9f31b3 Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Wed, 6 Nov 2019 08:23:38 -0800
Subject: [PATCH 23/24] squash! WIP: Add parallel_ensembles benchmark

- Remove weird intercept distribution
- Allow input and output funnelling and learning to be configured
---
 docs/benchmarks/parallel_ensembles.py | 241 ++++++++++++--------------
 1 file changed, 115 insertions(+), 126 deletions(-)

diff --git a/docs/benchmarks/parallel_ensembles.py b/docs/benchmarks/parallel_ensembles.py
index 51470bfaa..a7494cd83 100644
--- a/docs/benchmarks/parallel_ensembles.py
+++ b/docs/benchmarks/parallel_ensembles.py
@@ -1,13 +1,30 @@
+import cProfile
+import timeit
+
+import matplotlib.pyplot as plt
 import nengo
 import nengo_loihi
 from nengo_loihi import decode_neurons
 import numpy as np
-import timeit
-import traceback
-
-from utils import AreaIntercepts, Triangular
 
 rng = np.random.RandomState(1)
+seed = rng.randint(0, 2**31)
+
+LoihiEmulator = lambda net: nengo_loihi.Simulator(net, target="sim")
+LoihiSimulator = lambda net: nengo_loihi.Simulator(
+    net, target="loihi", hardware_options=dict(snip_max_spikes_per_step=300),
+)
+NengoSimulator = lambda net: nengo.Simulator(net)
+
+Simulator = LoihiSimulator
+
+funnel_input = True
+funnel_output = True
+learning = True
+profile = False
+#profile = True
+
+simtime = 10
 
 #n_input = 6
 #n_output = 3
@@ -22,150 +39,123 @@
 #pes_learning_rate = 5e-9
 pes_learning_rate = 5e-6
 #pes_learning_rate = 5e-4
-seed = 0
 
 # synapse time constants
-tau_input = 0.01  # on input connection
-tau_training = 0.01  # on the training signal
-#tau_output = 0.2  # on the output from the adaptive ensemble
-tau_output = 0.01  # on the output from the adaptive ensemble
-# NOTE: the time constant on the neural activity used in the learning
-# connection is the default 0.005, and can be set by specifying the
-# pre_synapse parameter inside the PES rule instantiation
-
-# set up neuron intercepts
-intercepts_bounds = [-0.3, 0.1]
-intercepts_mode = 0.1
-
-intercepts_dist = AreaIntercepts(
-    dimensions=n_input,
-    base=Triangular(intercepts_bounds[0], intercepts_mode, intercepts_bounds[1]),
-)
-intercepts = intercepts_dist.sample(n=n_neurons * n_ensembles, rng=rng)
-intercepts = intercepts.reshape(n_ensembles, n_neurons)
+tau_input = 0.01
+tau_error = 0.01
+tau_output = 0.01
 
-np.random.seed = seed
 encoders_dist = nengo.dists.UniformHypersphere(surface=True)
 encoders = encoders_dist.sample(n_neurons * n_ensembles, n_input, rng=rng)
 encoders = encoders.reshape(n_ensembles, n_neurons, n_input)
 
-nengo_model = nengo.Network(seed=seed)
-nengo_model.config[nengo.Ensemble].neuron_type = nengo.LIF()
-# nengo_model.config[nengo.Connection].synapse = None
+input_freq = np.pi
+input_phases = (1 / n_input) * np.arange(n_input)
 
-with nengo_model:
-    error_rng = np.random.RandomState(5)
+# desired transform between input and output
+assert n_input % n_output == 0
+inout_transform = np.repeat(np.eye(n_output), n_input / n_output, axis=1)
+inout_transform /= inout_transform.sum(axis=1, keepdims=True)
 
-    assert n_input % n_output == 0
-    inout_transform = np.repeat(np.eye(n_output), n_input / n_output, axis=1)
-    inout_transform /= inout_transform.sum(axis=1, keepdims=True)
+net = nengo.Network(seed=seed)
+net.config[nengo.Ensemble].neuron_type = nengo.LIF()
+net.config[nengo.Ensemble].intercepts = nengo.dists.Uniform(-1, 0.5)
+net.config[nengo.Ensemble].max_rates = nengo.dists.Uniform(100, 200)
+# net.config[nengo.Connection].synapse = None
 
-    #inout_transform = error_rng.uniform(-1, 1, size=(n_output, n_input))
-    #inout_transform /= np.sqrt((inout_transform**2).sum(axis=1, keepdims=True))
+with net:
+    error_rng = np.random.RandomState(5)
 
-    def arm_func(t, u_adapt):
-        freq = np.pi
-        phases = (1 / n_input) * np.arange(n_input)
-        inputs = np.sin(freq*t + 2*np.pi*phases)
+    def ctrl_func(t, u_adapt):
+        inputs = np.sin(input_freq*t + 2*np.pi*input_phases)
 
         target_output = np.dot(inout_transform, inputs)
         errors = u_adapt - target_output
 
-        if int(1000 * t) % 1000 == 0:
-            print(("i", inputs))
-            print(("u", u_adapt))
-            print(("o", target_output))
-            print(("e", errors))
-
         return inputs.tolist() + errors.tolist()
 
-    arm = nengo.Node(arm_func, size_in=n_output, size_out=n_input+n_output, label="arm")
-    arm_probe = nengo.Probe(arm)
-
-    input_decodeneurons = decode_neurons.Preset10DecodeNeurons()
-    onchip_input = input_decodeneurons.get_ensemble(dim=n_input)
-    nengo.Connection(arm[:n_input], onchip_input, synapse=None)
-    inp2ens_transform = np.hstack(
-        [np.eye(n_input), -np.eye(n_input)] * input_decodeneurons.pairs_per_dim
-    )
-
-    output_decodeneurons = decode_neurons.Preset10DecodeNeurons()
-    onchip_output = output_decodeneurons.get_ensemble(dim=n_output)
-    out2arm_transform = np.hstack(
-        [np.eye(n_output), -np.eye(n_output)] * output_decodeneurons.pairs_per_dim
-    ) / 2000.
-    nengo.Connection(onchip_output.neurons, arm, transform=out2arm_transform, synapse=tau_output)
-
-    adapt_ens = []
-    conn_learn = []
-    for ii in range(n_ensembles):
-        adapt_ens.append(
-            nengo.Ensemble(
-                n_neurons=n_neurons,
-                dimensions=n_input,
-                intercepts=intercepts[ii],
-                radius=np.sqrt(n_input),
-                encoders=encoders[ii],
-                label="ens%02d" % ii,
-            )
+    ctrl = nengo.Node(ctrl_func, size_in=n_output, size_out=n_input+n_output, label="ctrl")
+    ctrl_probe = nengo.Probe(ctrl)
+
+    input = ctrl[:n_input]
+    error = ctrl[n_input:]
+    output = ctrl
+
+    inp2ens_transform = None
+    if funnel_input:
+        input_decodeneurons = decode_neurons.Preset10DecodeNeurons()
+        onchip_input = input_decodeneurons.get_ensemble(dim=n_input)
+        nengo.Connection(input, onchip_input, synapse=None)
+        inp2ens_transform = np.hstack(
+            [np.eye(n_input), -np.eye(n_input)] * input_decodeneurons.pairs_per_dim
+        )
+        input = onchip_input
+
+    if funnel_output:
+        output_decodeneurons = decode_neurons.Preset10DecodeNeurons()
+        onchip_output = output_decodeneurons.get_ensemble(dim=n_output)
+        out2ctrl_transform = np.hstack(
+            [np.eye(n_output), -np.eye(n_output)] * output_decodeneurons.pairs_per_dim
+        ) / 2000.
+        nengo.Connection(
+            onchip_output.neurons,
+            output,
+            transform=out2ctrl_transform,
+            synapse=tau_output,
         )
+        output = onchip_output
 
-        # hook up input signal to adaptive population to provide context
-        inp2ens_transform_ii = np.dot(encoders[ii], inp2ens_transform)
-        nengo.Connection(onchip_input.neurons, adapt_ens[ii].neurons,
-                         transform=inp2ens_transform_ii, synapse=tau_input)
+    for ii in range(n_ensembles):
+        ens = nengo.Ensemble(
+            n_neurons=n_neurons,
+            dimensions=n_input,
+            radius=np.sqrt(n_input),
+            encoders=encoders[ii],
+            label="ens%02d" % ii,
+        )
 
-        conn_learn.append(
+        if inp2ens_transform is not None:
+            inp2ens_transform_ii = np.dot(encoders[ii], inp2ens_transform)
             nengo.Connection(
-                adapt_ens[ii],
-                onchip_output,
-                learning_rule_type=nengo.PES(
-                    pes_learning_rate, pre_synapse=tau_training,
-                ),
-                #transform=inout_transform / n_ensembles,
-                #transform=inout_transform / n_ensembles * 0.1,
-                transform=rng.uniform(-0.01, 0.01, size=(n_output, n_input)),
-                #transform=np.zeros((n_output, n_input)),
-                #transform=np.zeros((n_output, n_neurons)),
-                #transform=np.zeros((onchip_output.n_neurons, n_neurons)),
+                input.neurons,
+                ens.neurons,
+                transform=inp2ens_transform_ii,
+                synapse=tau_input
             )
-        )
+        else:
+            nengo.Connection(input, ens, synapse=tau_input)
+
+        conn_kwargs = dict()
+        if learning:
+            conn_kwargs["transform"] = rng.uniform(-0.01, 0.01, size=(n_output, n_input))
+            conn_kwargs["learning_rule_type"] = nengo.PES(
+                pes_learning_rate, pre_synapse=tau_error,
+            )
+        else:
+            conn_kwargs["transform"] = inout_transform / n_ensembles
 
-        # hook up the training signal to the learning rule
-        nengo.Connection(
-            arm[n_input:], conn_learn[ii].learning_rule,
-            synapse=None,
-            #synapse=tau_training
-        )
+        conn = nengo.Connection(ens, output, **conn_kwargs)
 
-import cProfile
+        if learning:
+            nengo.Connection(error, conn.learning_rule, synapse=None)
 
-with nengo_loihi.Simulator(
-        nengo_model,
-        target='loihi',
-        hardware_options=dict(snip_max_spikes_per_step=300)) as sim:
-# with nengo.Simulator(nengo_model) as sim:
-
-    hw = sim.sims["loihi"]
-    print("n_errors: %s" % hw.nengo_io_h2c_errors)
-    print("n_outputs: %s" % hw.nengo_io_c2h_count)
-
-    sim.run(0.001)
-    start = timeit.default_timer()
-    #sim.run(0.005)
-    #sim.run(1)
-    #sim.run(4)
-    sim.run(10)
-    #sim.run(30)
-    #cProfile.runctx("sim.run(1)", globals(), locals(), sort="cumtime")
-    #cProfile.runctx("sim.run(0.001)", globals(), locals(), sort="cumtime")
-    print('Run time: %0.5f' % (timeit.default_timer() - start))
-    print('Run time/step: %0.2f ms' % (1000 * sim.timers["snips"] / sim.n_steps))
-
-
-    inputs = sim.data[arm_probe][:, :n_input]
+
+with Simulator(net) as sim:
+    sim.run(0.001)  # eliminate any extra startup from timing
+
+    steps = sim.n_steps
+    timer = timeit.default_timer()
+    if profile:
+        cProfile.runctx("sim.run(%s)" % simtime, globals(), locals(), sort="cumtime")
+    else:
+        sim.run(simtime)
+    timer = timeit.default_timer() - timer
+    steps = sim.n_steps - steps
+    print('Run time/step: %0.2f ms' % (1000 * timer / steps))
+
+    inputs = sim.data[ctrl_probe][:, :n_input]
     targets = np.dot(inputs, inout_transform.T)
-    errors = sim.data[arm_probe][:, n_input:]
+    errors = sim.data[ctrl_probe][:, n_input:]
     outputs = errors + targets
 
     error_steps = min(sim.n_steps, 2000)
@@ -173,7 +163,6 @@ def arm_func(t, u_adapt):
     end_error = np.abs(errors[-error_steps:]).mean()
     print("Error: start %0.3f, end %0.3f" % (start_error, end_error))
 
-import matplotlib.pyplot as plt
-plt.plot(sim.trange(), targets, ":")
-plt.plot(sim.trange(), outputs)
-plt.savefig("dynamics_adaptation_test.pdf")
+    plt.plot(sim.trange(), targets, ":")
+    plt.plot(sim.trange(), outputs)
+    plt.savefig("parallel_ensembles.pdf")

From 836e9d241741877abaf8ff60791c7ff2337c95be Mon Sep 17 00:00:00 2001
From: Eric Hunsberger <eric.hunsberger@appliedbrainresearch.com>
Date: Wed, 6 Nov 2019 08:41:25 -0800
Subject: [PATCH 24/24] BUG: Running parallel_ensemble.py results in invalid
 axon_type bug

---
 docs/benchmarks/parallel_ensembles.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/benchmarks/parallel_ensembles.py b/docs/benchmarks/parallel_ensembles.py
index a7494cd83..78a49e8df 100644
--- a/docs/benchmarks/parallel_ensembles.py
+++ b/docs/benchmarks/parallel_ensembles.py
@@ -18,9 +18,9 @@
 
 Simulator = LoihiSimulator
 
-funnel_input = True
+funnel_input = False
 funnel_output = True
-learning = True
+learning = False
 profile = False
 #profile = True
 
@@ -32,10 +32,14 @@
 n_output = 5
 
 #n_neurons = 10
+#n_neurons = 100
 n_neurons = 400
 #n_neurons = 1000
+
 #n_ensembles = 1
+#n_ensembles = 3
 n_ensembles = 20
+
 #pes_learning_rate = 5e-9
 pes_learning_rate = 5e-6
 #pes_learning_rate = 5e-4