From 47e7c6a8f6b8eafb3cda4aa9b4c220c954b6feaa Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 23 Aug 2024 16:35:25 -0500
Subject: [PATCH] safety commit

---
 my_pyscf/lassi/op_o1/frag.py      |  71 ++++++++++--
 my_pyscf/lassi/op_o1/hams2ovlp.py |  10 +-
 my_pyscf/lassi/op_o1/hci.py       | 178 +++++++++++++++++++++++++++---
 my_pyscf/lassi/op_o1/rdm.py       |  16 +--
 my_pyscf/lassi/op_o1/stdm.py      |  21 ++--
 5 files changed, 248 insertions(+), 48 deletions(-)

diff --git a/my_pyscf/lassi/op_o1/frag.py b/my_pyscf/lassi/op_o1/frag.py
index b1bba6ce..039f596f 100644
--- a/my_pyscf/lassi/op_o1/frag.py
+++ b/my_pyscf/lassi/op_o1/frag.py
@@ -1,15 +1,16 @@
 import numpy as np
 from scipy import linalg
 from pyscf import lib
-from pyscf.fci.direct_spin1 import trans_rdm12s, contract_1e
+from pyscf.fci.direct_spin1 import trans_rdm12s, contract_1e, contract_2e, absorb_h1e
+from pyscf.fci.direct_uhf import contract_1e as contract_1e_uhf
 from pyscf.fci.addons import cre_a, cre_b, des_a, des_b
 from pyscf.fci import cistring
 from itertools import product, combinations
 from mrh.my_pyscf.lassi.citools import get_lroots, get_rootaddr_fragaddr
 from mrh.my_pyscf.lassi.op_o1.utilities import *
 
-class LSTDMint1 (object):
-    ''' LAS state transition density matrix intermediate 1: fragment-local data.
+class FragTDMInt (object):
+    ''' Fragment-local LAS state transition density matrix intermediate
 
         Quasi-sparse-memory storage for LAS-state transition density matrix single-fragment
         intermediates. Stores all local transition density matrix factors. Run the `kernel` method
@@ -465,7 +466,17 @@ def trans_rdm12s_loop (iroot, bra, ket):
         return t0
 
     def contract_h00 (self, h_00, h_11, h_22, ket):
-        raise NotImplementedError
+        r = self.rootaddr[ket]
+        n = self.fragaddr[ket]
+        norb, nelec = self.norb, self.nelec_r[r]
+        ci = self.ci[r][n]
+        h_uhf = (h_11[0] - h_11[1]) / 2
+        h_uhf = [h_uhf, -h.uhf]
+        h_11 = h_11.sum (0) / 2
+        h2eff = absorb_h1e (h_11, h_22, norb, nelec, 0.5)
+        hci = h_00*ci + contract_2e (h2eff, ci, norb, nelec)
+        hci += contract_uhf (h_uhf, ci, norb, nelec)
+        return hci
 
     def contract_h10 (self, spin, h_10, h_21, ket):
         r = self.rootaddr[ket]
@@ -497,16 +508,52 @@ def contract_h01 (self, spin, h_01, h_12, ket):
         return hci
 
     def contract_h20 (self, spin, h_20, ket):
-        raise NotImplementedError
+        r = self.rootaddr[ket]
+        n = self.fragaddr[ket]
+        norb, nelec = self.norb, self.nelec_r[r]
+        ci = self.ci[r][n]
+        # 0, 1, 2 = aa, ab, bb
+        cre_op1 = (cre_a, cre_b)[int (spin>1)]
+        cre_op2 = (cre_a, cre_b)[int (spin>0)]
+        hci = 0
+        for q in range (self.norb):
+            qci = cre_op2 (ci, norb, nelec, q)
+            for p in range (self.norb):
+                hci += h_20[p,q] * cre_op1 (qci, norb, nelec, p)
+        return hci
 
     def contract_h02 (self, spin, h_02, ket):
-        raise NotImplementedError
+        r = self.rootaddr[ket]
+        n = self.fragaddr[ket]
+        norb, nelec = self.norb, self.nelec_r[r]
+        ci = self.ci[r][n]
+        # 0, 1, 2 = aa, ab, bb
+        des_op1 = (des_a, des_b)[int (spin>1)]
+        des_op2 = (des_a, des_b)[int (spin>0)]
+        hci = 0
+        for q in range (self.norb):
+            qci = des_op1 (ci, norb, nelec, q)
+            for p in range (self.norb):
+                hci += h_02[p,q] * des_op2 (qci, norb, nelec, p)
+        return hci
 
     def contract_h11 (self, spin, h_11, ket):
-        raise NotImplementedError
+        r = self.rootaddr[ket]
+        n = self.fragaddr[ket]
+        norb, nelec = self.norb, self.nelec_r[r]
+        ci = self.ci[r][n]
+        # 0, 1 = ab, ba
+        cre_op = (cre_a, cre_b)[spin]
+        des_op = (des_b, des_a)[spin]
+        hci = 0
+        for q in range (self.norb):
+            qci = des_op (ci, norb, nelec, q)
+            for p in range (self.norb):
+                hci += h_11[p,q] * cre_op (qci, norb, nelec, p)
+        return hci
 
-def make_ints (las, ci, nelec_frs, screen_linequiv=True, _LSTDMint1_class=LSTDMint1):
-    ''' Build fragment-local intermediates (`LSTDMint1`) for LASSI o1
+def make_ints (las, ci, nelec_frs, screen_linequiv=True, nlas=None, _FragTDMInt_class=FragTDMInt):
+    ''' Build fragment-local intermediates (`FragTDMInt`) for LASSI o1
 
     Args:
         las : instance of :class:`LASCINoSymm`
@@ -525,18 +572,18 @@ def make_ints (las, ci, nelec_frs, screen_linequiv=True, _LSTDMint1_class=LSTDMi
         hopping_index : ndarray of ints of shape (nfrags, 2, nroots, nroots)
             element [i,j,k,l] reports the change of number of electrons of
             spin j in fragment i between LAS rootspaces k and l
-        ints : list of length nfrags of instances of :class:`LSTDMint1`
+        ints : list of length nfrags of instances of :class:`FragTDMInt`
         lroots: ndarray of ints of shape (nfrags, nroots)
             Number of states within each fragment and rootspace
     '''
     nfrags, nroots = nelec_frs.shape[:2]
-    nlas = las.ncas_sub
+    if nlas is None: nlas = las.ncas_sub
     lroots = get_lroots (ci)
     hopping_index, zerop_index, onep_index = lst_hopping_index (nelec_frs)
     rootaddr, fragaddr = get_rootaddr_fragaddr (lroots)
     ints = []
     for ifrag in range (nfrags):
-        tdmint = _LSTDMint1_class (ci[ifrag], hopping_index[ifrag], zerop_index, onep_index,
+        tdmint = _FragTDMInt_class (ci[ifrag], hopping_index[ifrag], zerop_index, onep_index,
                                    nlas[ifrag], nroots, nelec_frs[ifrag], rootaddr,
                                    fragaddr[ifrag], ifrag, screen_linequiv=screen_linequiv)
         lib.logger.timer (las, 'LAS-state TDM12s fragment {} intermediate crunching'.format (
diff --git a/my_pyscf/lassi/op_o1/hams2ovlp.py b/my_pyscf/lassi/op_o1/hams2ovlp.py
index 554d4504..45d6931f 100644
--- a/my_pyscf/lassi/op_o1/hams2ovlp.py
+++ b/my_pyscf/lassi/op_o1/hams2ovlp.py
@@ -10,8 +10,8 @@
 # for two fragments this is (sign?)
 # ((d1a_pp - d1b_pp) * (d1a_qq - d1b_qq))/4 - (sp_pp*sm_qq + sm_pp*sp_qq)/2
 
-class HamS2ovlpint (stdm.LSTDMint2):
-    __doc__ = stdm.LSTDMint2.__doc__ + '''
+class HamS2Ovlp (stdm.LSTDM):
+    __doc__ = stdm.LSTDM.__doc__ + '''
 
     SUBCLASS: Hamiltonian, spin-squared, and overlap matrices
 
@@ -29,7 +29,7 @@ class HamS2ovlpint (stdm.LSTDMint2):
 
     def __init__(self, ints, nlas, hopping_index, lroots, h1, h2, mask_bra_space=None,
                  mask_ket_space=None, log=None, max_memory=2000, dtype=np.float64):
-        stdm.LSTDMint2.__init__(self, ints, nlas, hopping_index, lroots,
+        stdm.LSTDM.__init__(self, ints, nlas, hopping_index, lroots,
                                 mask_bra_space=mask_bra_space, mask_ket_space=mask_ket_space,
                                 log=log, max_memory=max_memory, dtype=dtype)
         if h1.ndim==2: h1 = np.stack ([h1,h1], axis=0)
@@ -397,7 +397,7 @@ def _crunch_2c_(self, bra, ket, i, j, k, l, s2lt):
         self.dt_2c, self.dw_2c = self.dt_2c + dt, self.dw_2c + dw
         return ham, s2, (l, j, i, k)
 
-def ham (las, h1, h2, ci, nelec_frs, _HamS2ovlpint_class=HamS2ovlpint, **kwargs):
+def ham (las, h1, h2, ci, nelec_frs, _HamS2Ovlp_class=HamS2Ovlp, **kwargs):
     ''' Build Hamiltonian, spin-squared, and overlap matrices in LAS product state basis
 
     Args:
@@ -438,7 +438,7 @@ def ham (las, h1, h2, ci, nelec_frs, _HamS2ovlpint_class=HamS2ovlpint, **kwargs)
 
     # Second pass: upper-triangle
     t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
-    outerprod = _HamS2ovlpint_class (ints, nlas, hopping_index, lroots, h1, h2, dtype=dtype,
+    outerprod = _HamS2Ovlp_class (ints, nlas, hopping_index, lroots, h1, h2, dtype=dtype,
                                      max_memory=max_memory, log=log)
     lib.logger.timer (las, 'LASSI Hamiltonian second intermediate indexing setup', *t0)
     ham, s2, ovlp, t0 = outerprod.kernel ()
diff --git a/my_pyscf/lassi/op_o1/hci.py b/my_pyscf/lassi/op_o1/hci.py
index 51575e59..1c0e52b6 100644
--- a/my_pyscf/lassi/op_o1/hci.py
+++ b/my_pyscf/lassi/op_o1/hci.py
@@ -4,9 +4,10 @@
 from pyscf.fci import cistring 
 from mrh.my_pyscf.lassi.op_o1 import stdm, frag, hams2ovlp
 from mrh.my_pyscf.lassi.op_o1.utilities import *
+from mrh.my_pyscf.lassi.citools import get_lroots
 
-class ContractHamCI (stdm.LSTDMint2):
-    __doc__ = stdm.LSTDMint2.__doc__ + '''
+class ContractHamCI (stdm.LSTDM):
+    __doc__ = stdm.LSTDM.__doc__ + '''
 
     SUBCLASS: Contract Hamiltonian on CI vectors and integrate over all but one fragment,
     for all fragments.
@@ -23,7 +24,7 @@ def __init__(self, ints, nlas, hopping_index, lroots, h1, h2, nbra=1,
         nfrags, _, nroots, _ = hopping_index.shape
         if nfrags > 2: raise NotImplementedError ("Spectator fragments in _crunch_1c_")
         nket = nroots - nbra
-        hams2ovlp.HamS2ovlpint.__init__(self, ints, nlas, hopping_index, lroots, h1, h2,
+        hams2ovlp.HamS2Ovlp.__init__(self, ints, nlas, hopping_index, lroots, h1, h2,
                                         mask_bra_space = list (range (nket, nroots)),
                                         mask_ket_space = list (range (nket)),
                                         log=log, max_memory=max_memory, dtype=dtype)
@@ -49,6 +50,22 @@ def _init_vecs (self):
             hci_fr_pabq.append (hci_r_pabq)
         return hci_fr_pabq
 
+    def _crunch_1d_(self, bra, ket, i):
+        '''Compute a single-fragment density fluctuation, for both the 1- and 2-RDMs.'''
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
+        raise NotImplementedError
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_1d, self.dw_1d = self.dt_1d + dt, self.dw_1d + dw
+        return
+
+    def _crunch_2d_(self, bra, ket, i, j):
+        '''Compute a two-fragment density fluctuation.'''
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
+        raise NotImplementedError
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_2d, self.dw_2d = self.dt_2d + dt, self.dw_2d + dw
+        return
+ 
     def _crunch_1c_(self, bra, ket, i, j, s1):
         '''Perform a single electron hop; i.e.,
         
@@ -58,9 +75,9 @@ def _crunch_1c_(self, bra, ket, i, j, s1):
         
         j ---s1---> i
         '''
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         hci_f_ab, excfrags = self._get_vecs_(bra, ket)
         excfrags = excfrags.intersection ({i, j})
-        if self.nfrags > 2: raise NotImplementedError ("Spectator fragments in _crunch_1c_")
         if not len (excfrags): return
         p, q = self.get_range (i)
         r, s = self.get_range (j)
@@ -86,17 +103,87 @@ def _crunch_1c_(self, bra, ket, i, j, s1):
                 axes=((0,1,2),(2,0,1)))
             h_12 = np.tensordot (D_i, h2_ijjj, axes=1).transpose (1,2,0)
             hci_f_ab[j] += fac * self.ints[j].contract_h01 (s1, h_01, h_12, ket)
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_1c, self.dw_1c = self.dt_1c + dt, self.dw_1c + dw
         self._put_vecs_(bra, ket, hci_f_ab, i, j)
         return
 
+    def _crunch_1c1d_(self, bra, ket, i, j, k, s1):
+        '''Compute the reduced density matrix elements of a coupled electron-hop and
+        density fluctuation.'''
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
+        raise NotImplementedError
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_1c1d, self.dw_1c1d = self.dt_1c1d + dt, self.dw_1c1d + dw
+        return
+
     def _crunch_1s_(self, bra, ket, i, j):
+        '''Compute the reduced density matrix elements of a spin unit hop; i.e.,
+
+        <bra|i'(a)j'(b)i(b)j(a)|ket>
+
+        i.e.,
+
+        j ---a---> i
+        i ---b---> j
+
+        and conjugate transpose
+        '''
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         raise NotImplementedError
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_1s, self.dw_1s = self.dt_1s + dt, self.dw_1s + dw
+        return
 
     def _crunch_1s1c_(self, bra, ket, i, j, k):
+        '''Compute the reduced density matrix elements of a spin-charge unit hop; i.e.,
+
+        <bra|i'(a)k'(b)j(b)k(a)|ket>
+
+        i.e.,
+
+        k ---a---> i
+        j ---b---> k
+
+        and conjugate transpose
+        '''
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         raise NotImplementedError
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_1s1c, self.dw_1s1c = self.dt_1s1c + dt, self.dw_1s1c + dw
+        return
 
     def _crunch_2c_(self, bra, ket, i, j, k, l, s2lt):
+        '''Compute the reduced density matrix elements of a two-electron hop; i.e.,
+
+        <bra|i'(s1)k'(s2)l(s2)j(s1)|ket>
+
+        i.e.,
+
+        j ---s1---> i
+        l ---s2---> k
+
+        with
+
+        s2lt = 0, 1, 2
+        s1   = a, a, b
+        s2   = a, b, b
+
+        and conjugate transpose
+
+        Note that this includes i=k and/or j=l cases, but no other coincident fragment indices. Any
+        other coincident fragment index (that is, any coincident index between the bra and the ket)
+        turns this into one of the other interactions implemented in the above _crunch_ functions:
+        s1 = s2  AND SORT (ik) = SORT (jl)                 : _crunch_1d_ and _crunch_2d_
+        s1 = s2  AND (i = j XOR i = l XOR j = k XOR k = l) : _crunch_1c_ and _crunch_1c1d_
+        s1 != s2 AND (i = l AND j = k)                     : _crunch_1s_
+        s1 != s2 AND (i = l XOR j = k)                     : _crunch_1s1c_
+        '''
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         raise NotImplementedError
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_2c, self.dw_2c = self.dt_2c + dt, self.dw_2c + dw
+        return
 
     def env_addr_fragpop (self, bra, i, r):
         rem = 0
@@ -130,15 +217,6 @@ def _get_vecs_(self, bra, ket):
             hci_f_ab[i] = np.zeros_like (hci_r_pabq[bra_r][addr,:,:,ket])
         return hci_f_ab, excfrags
 
-    def _put_vecs_(self, bra, ket, vecs, *inv):
-        t0, w0 = logger.process_clock (), logger.perf_counter ()
-        bras, kets, facs = self._get_spec_addr_ovlp (bra, ket, *inv)
-        for bra, ket, fac in zip (bras, kets, facs):
-            hci_r_pabq = self.hci_fr_pabq[i]
-            # TODO: buffer
-            hci_f_ab[i] = np.zeros_like (hci_r_pabq[bra_r][addr,:,:,ket])
-        return hci_f_ab, excfrags
-
     def _put_vecs_(self, bra, ket, vecs, *inv):
         t0, w0 = logger.process_clock (), logger.perf_counter ()
         bras, kets, facs = self._get_spec_addr_ovlp (bra, ket, *inv)
@@ -214,7 +292,6 @@ def contract_ham_ci (las, h1, h2, ci_fr_ket, nelec_frs_ket, ci_fr_bra, nelec_frs
             ndeta_bra[i,j],ndetb_bra[i,j],ndim_ket).
     '''
     log = lib.logger.new_logger (las, las.verbose)
-    log = lib.logger.new_logger (las, las.verbose)
     nlas = las.ncas_sub
     nfrags, nbra = nelec_frs_bra.shape[:2]
     nket = nelec_frs_ket.shape[1]
@@ -233,6 +310,79 @@ def contract_ham_ci (las, h1, h2, ci_fr_ket, nelec_frs_ket, ci_fr_bra, nelec_frs
     hket_fr_pabq, t0 = contracter.kernel ()
     lib.logger.timer (las, 'LASSI Hamiltonian contraction second intermediate crunching', *t0)
 
+    #for ifrag in range (nfrags):
+    #    gen_hket = gen_contract_ham_ci_const (ifrag, nbra, las, h1, h2, ci, nelec_frs, soc=soc,
+    #                                          orbsym=orbsym, wfnsym=wfnsym)
+    #    for ibra, hket_pabq in enumerate (gen_hket):
+    #        hket_fr_pabq[ifrag][ibra][:] += hket_pabq[:]
     return hket_fr_pabq
 
+def gen_contract_ham_ci_const (ifrag, nbra, las, h1, h2, ci, nelec_frs, soc=0, orbsym=None,
+                               wfnsym=None):
+    '''Constant-term parts of contract_ham_ci for fragment ifrag'''
+    log = lib.logger.new_logger (las, las.verbose)
+    nlas = las.ncas_sub
+    nfrags, nroots = nelec_frs.shape[:2]
+    nket = nroots - nbra
+    dtype = ci[0][0].dtype
+    max_memory = getattr (las, 'max_memory', las.mol.max_memory)
+
+    lroots = get_lroots (ci)
+    nprods_ket = np.sum (np.prod (lroots[:,:-nbra], axis=0))
+    norb_i = nlas[ifrag]
+    ci_i = ci[ifrag]
+    nelec_i_rs = nelec_frs[ifrag]
+
+    # index down to omit fragment
+    idx = np.ones (las.nfrags, dtype=bool)
+    idx[ifrag] = False
+    nelec_frs = nelec_frs[idx]
+    ci_jfrag = [c for i,c in enumerate (ci) if i != ifrag]
+    j = sum (nlas[:ifrag+1])
+    i = j - nlas[ifrag]
+    ix = np.ones (h1.shape[0], dtype=bool)
+    ix[i:j] = False
+    h1 = h1[np.ix_(ix,ix)]
+    h2 = h2[np.ix_(ix,ix,ix,ix)]
+    nlas = nlas[idx]
+
+    # First pass: single-fragment intermediates
+    hopping_index, ints, lroots = frag.make_ints (las, ci_jfrag, nelec_frs, nlas=nlas)
+    nstates = np.sum (np.prod (lroots, axis=0))
+        
+    # Memory check
+    current_memory = lib.current_memory ()[0]
+    required_memory = dtype.itemsize*nstates*nstates*3/1e6
+    if current_memory + required_memory > max_memory:
+        raise MemoryError ("current: {}; required: {}; max: {}".format (
+            current_memory, required_memory, max_memory))
+
+    # Second pass: upper-triangle
+    outerprod = hams2ovlp.HamS2Ovlp (ints, nlas, hopping_index, lroots, h1, h2, dtype=dtype,
+                                     mask_bra_space = list (range (nket, nroots)),
+                                     mask_ket_space = list (range (nket)),
+                                     max_memory=max_memory, log=log)
+    ham = outerprod.kernel ()[0]
+    for ibra in range (nket, nroots):
+        i, j = outerprod.offs_lroots[ibra]
+        nelec_i = nelec_i_rs[ibra]
+        ndeta = cistring.num_strings (norb_i, nelec_i[0])
+        ndetb = cistring.num_strings (norb_i, nelec_i[1])
+        hket_pabq = np.zeros ((nprods_ket, j-i, ndeta, ndetb),
+                              dtype=outerprod.dtype).transpose (1,2,3,0)
+        m = 0
+        for iket in range (nket):
+            if tuple (nelec_i_rs[iket]) != tuple (nelec_i): continue
+            k, l = outerprod.offs_lroots[iket]
+            hket = np.multiply.outer (ci_i[iket], ham[i:j,k:l]) # qabpq
+            if ci_i[iket].ndim == 2: hket = hket[None,...]
+            nq1, na, nb, np1, nq2 = hket.shape
+            new_shape = [nq1,na,nb,np1] + list (outerprod.lroots[::-1,iket])
+            hket = np.moveaxis (hket.reshape (new_shape), 0, -ifrag)
+            new_shape = [na,nb,np1,nq1*nq2]
+            hket = hket.reshape (new_shape).transpose (2,0,1,3)
+            n = m + nq1*nq2
+            hket_pabq[:,:,:,m:n] = hket[:]
+            m = n
+        yield hket_pabq
 
diff --git a/my_pyscf/lassi/op_o1/rdm.py b/my_pyscf/lassi/op_o1/rdm.py
index ffe217ab..5a6b2abd 100644
--- a/my_pyscf/lassi/op_o1/rdm.py
+++ b/my_pyscf/lassi/op_o1/rdm.py
@@ -14,8 +14,8 @@
 def c_arr (arr): return arr.ctypes.data_as(ctypes.c_void_p)
 c_int = ctypes.c_int
 
-class LSTDMint1 (frag.LSTDMint1):
-    __doc__ = frag.LSTDMint1.__doc__ + '''
+class FragTDMInt (frag.FragTDMInt):
+    __doc__ = frag.FragTDMInt.__doc__ + '''
 
     Transpose the transition density matrix arrays so that the lroots indices
     are fastest-moving, since RDMs require contracting over these indices
@@ -25,8 +25,8 @@ def setmanip (self, x):
         x = np.ascontiguousarray (np.moveaxis (x, [0,1], [-2,-1]))
         return np.moveaxis (x, [-2,-1], [0,1])
 
-class LRRDMint (stdm.LSTDMint2):
-    __doc__ = stdm.LSTDMint2.__doc__ + '''
+class LRRDM (stdm.LSTDM):
+    __doc__ = stdm.LSTDM.__doc__ + '''
 
     SUBCLASS: LASSI-root reduced density matrix
 
@@ -43,7 +43,7 @@ class LRRDMint (stdm.LSTDMint2):
 
     def __init__(self, ints, nlas, hopping_index, lroots, si, mask_bra_space=None,
                  mask_ket_space=None, log=None, max_memory=2000, dtype=np.float64):
-        stdm.LSTDMint2.__init__(self, ints, nlas, hopping_index, lroots,
+        stdm.LSTDM.__init__(self, ints, nlas, hopping_index, lroots,
                                 mask_bra_space=mask_bra_space,
                                 mask_ket_space=mask_ket_space,
                                 log=log, max_memory=max_memory,
@@ -587,7 +587,7 @@ def get_fdm1_maker (las, ci, nelec_frs, si, **kwargs):
     nstates = np.sum (np.prod (lroots, axis=0))
         
     # Second pass: upper-triangle
-    outerprod = LRRDMint (ints, nlas, hopping_index, lroots, si, dtype=dtype,
+    outerprod = LRRDM (ints, nlas, hopping_index, lroots, si, dtype=dtype,
                           max_memory=max_memory, log=log)
 
     # Spoof nonuniq_exc to avoid summing together things that need to be separate
@@ -627,7 +627,7 @@ def roots_make_rdm12s (las, ci, nelec_frs, si, **kwargs):
     dtype = ci[0][0].dtype
 
     # First pass: single-fragment intermediates
-    hopping_index, ints, lroots = frag.make_ints (las, ci, nelec_frs, _LSTDMint1_class=LSTDMint1)
+    hopping_index, ints, lroots = frag.make_ints (las, ci, nelec_frs, _FragTDMInt_class=FragTDMInt)
     nstates = np.sum (np.prod (lroots, axis=0))
     
     # Memory check
@@ -639,7 +639,7 @@ def roots_make_rdm12s (las, ci, nelec_frs, si, **kwargs):
 
     # Second pass: upper-triangle
     t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
-    outerprod = LRRDMint (ints, nlas, hopping_index, lroots, si, dtype=dtype,
+    outerprod = LRRDM (ints, nlas, hopping_index, lroots, si, dtype=dtype,
                           max_memory=max_memory, log=log)
     lib.logger.timer (las, 'LASSI root RDM12s second intermediate indexing setup', *t0)
     rdm1s, rdm2s, t0 = outerprod.kernel ()
diff --git a/my_pyscf/lassi/op_o1/stdm.py b/my_pyscf/lassi/op_o1/stdm.py
index 9ee65c93..b7ffb083 100644
--- a/my_pyscf/lassi/op_o1/stdm.py
+++ b/my_pyscf/lassi/op_o1/stdm.py
@@ -14,14 +14,14 @@ def c_arr (arr): return arr.ctypes.data_as(ctypes.c_void_p)
 c_int = ctypes.c_int
 
 def mask_exc_table (exc, col=0, mask_space=None):
-    if mask_space is None: return exc
+    if mask_space is None: return np.ones (exc.shape[0], dtype=bool)
     mask_space = np.asarray (mask_space)
     if mask_space.dtype in (bool, np.bool_):
         mask_space = np.where (mask_space)[0]
     idx = np.isin (exc[:,col], mask_space)
-    return exc[idx]
+    return idx
 
-class LSTDMint2 (object):
+class LSTDM (object):
     ''' LAS state transition density matrix intermediate 2 - whole-system DMs
         Carry out multiplications such as
 
@@ -48,7 +48,7 @@ class LSTDMint2 (object):
         things (i.e. operators or DMs in different basis).
 
         Args:
-            ints : list of length nfrags of instances of :class:`LSTDMint1`
+            ints : list of length nfrags of instances of :class:`FragTDMInt`
                 fragment-local intermediates
             nlas : list of length nfrags of integers
                 numbers of active orbitals in each fragment
@@ -155,7 +155,7 @@ def make_exc_tables (self, hopping_index):
 
         Returns:
             exc: dict with str keys and ndarray-of-int values. Each row of each ndarray is the
-                argument list for 1 call to the LSTDMint2._crunch_*_ function with the name that
+                argument list for 1 call to the LSTDM._crunch_*_ function with the name that
                 corresponds to the key str (_crunch_1d_, _crunch_1s_, etc.).
         '''
         exc = {}
@@ -289,9 +289,12 @@ def make_exc_tables (self, hopping_index):
         return exc
 
     def mask_exc_table_(self, exc, lbl, mask_bra_space=None, mask_ket_space=None):
-        # Part 1: restrict to the caller-specified rectangle
-        exc = mask_exc_table (exc, col=0, mask_space=mask_bra_space)
-        exc = mask_exc_table (exc, col=1, mask_space=mask_ket_space)
+        # Part 1: restrict to the caller-specified rectangle OR its transpose
+        idx1  = mask_exc_table (exc, col=0, mask_space=mask_bra_space)
+        idx1 &= mask_exc_table (exc, col=1, mask_space=mask_ket_space)
+        idx2  = mask_exc_table (exc, col=1, mask_space=mask_bra_space)
+        idx2 &= mask_exc_table (exc, col=0, mask_space=mask_ket_space)
+        exc = exc[idx1|idx2]
         # Part 2: identify interactions which are equivalent except for the overlap
         # factor of spectator fragments. Reduce the exc table only to the unique
         # interactions and populate self.nonuniq_exc with the corresponding
@@ -980,7 +983,7 @@ def make_stdm12s (las, ci, nelec_frs, **kwargs):
 
     # Second pass: upper-triangle
     t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
-    outerprod = LSTDMint2 (ints, nlas, hopping_index, lroots, dtype=dtype,
+    outerprod = LSTDM (ints, nlas, hopping_index, lroots, dtype=dtype,
                            max_memory=max_memory, log=log)
     lib.logger.timer (las, 'LAS-state TDM12s second intermediate indexing setup', *t0)
     tdm1s, tdm2s, t0 = outerprod.kernel ()