diff --git a/.github/workflows/docs-check.yml b/.github/workflows/docs-check.yml index 826278ff..1fffaa1f 100644 --- a/.github/workflows/docs-check.yml +++ b/.github/workflows/docs-check.yml @@ -4,7 +4,7 @@ on: pull_request jobs: docs: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 container: image: unhumbleben/nupack:latest steps: diff --git a/.github/workflows/run_unit_tests.yml b/.github/workflows/run_unit_tests.yml index fbfbc571..c78912ea 100644 --- a/.github/workflows/run_unit_tests.yml +++ b/.github/workflows/run_unit_tests.yml @@ -8,7 +8,7 @@ on: pull_request jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 container: image: unhumbleben/nupack:latest strategy: diff --git a/examples/hamming_dist_test.py b/examples/hamming_dist_test.py index e53f236f..abd55ce8 100644 --- a/examples/hamming_dist_test.py +++ b/examples/hamming_dist_test.py @@ -5,13 +5,13 @@ def main(): domain_length = 15 - # energy_constraint = dc.NearestNeighborEnergyConstraint(low_energy=-9.2, high_energy=-7) - numpy_constraints = [ # energy_constraint, - nc.RunsOfBasesConstraint(['C', 'G'], 4), - nc.RunsOfBasesConstraint(['A', 'T'], 4) + # energy_constraint = dc.NearestNeighborEnergyFilter(low_energy=-9.2, high_energy=-7) + numpy_filters = [ # energy_constraint, + nc.RunsOfBasesFilter(['C', 'G'], 4), + nc.RunsOfBasesFilter(['A', 'T'], 4) ] domain_pool = nc.DomainPool(f'length-{domain_length} domains', domain_length, - numpy_constraints=numpy_constraints, replace_with_close_sequences=True) + numpy_filters=numpy_filters, replace_with_close_sequences=True) random_seed = 0 strands = [nc.Strand([f'{i}' for i in range(1, 50)])] diff --git a/examples/many_strands_no_common_domains.py b/examples/many_strands_no_common_domains.py index 1b2245e0..c9f1e411 100644 --- a/examples/many_strands_no_common_domains.py +++ b/examples/many_strands_no_common_domains.py @@ -8,7 +8,7 @@ import nuad.constraints as nc # type: ignore import nuad.vienna_nupack as nv # type: ignore import nuad.search as 
ns # type: ignore -from nuad.constraints import NumpyConstraint +from nuad.constraints import NumpyFilter def f(x: int | float) -> float: @@ -76,17 +76,17 @@ def main() -> None: parallel = False # parallel = True - numpy_constraints: List[NumpyConstraint] = [ - nc.NearestNeighborEnergyConstraint(-9.3, -9.0, 52.0), - # nc.BaseCountConstraint(base='G', high_count=1), - # nc.BaseEndConstraint(bases=('C', 'G')), - # nc.RunsOfBasesConstraint(['C', 'G'], 4), - # nc.RunsOfBasesConstraint(['A', 'T'], 4), - # nc.BaseEndConstraint(bases=('A', 'T')), - # nc.BaseEndConstraint(bases=('C', 'G'), distance_from_end=1), - # nc.BaseAtPositionConstraint(bases='T', position=3), - # nc.ForbiddenSubstringConstraint(['GGGG', 'CCCC']), - # nc.RestrictBasesConstraint(bases=['A', 'T', 'C']), + numpy_filters: List[NumpyFilter] = [ + nc.NearestNeighborEnergyFilter(-9.3, -9.0, 52.0), + # nc.BaseCountFilter(base='G', high_count=1), + # nc.BaseEndFilter(bases=('C', 'G')), + # nc.RunsOfBasesFilter(['C', 'G'], 4), + # nc.RunsOfBasesFilter(['A', 'T'], 4), + # nc.BaseEndFilter(bases=('A', 'T')), + # nc.BaseEndFilter(bases=('C', 'G'), distance_from_end=1), + # nc.BaseAtPositionFilter(bases='T', position=3), + # nc.ForbiddenSubstringFilter(['GGGG', 'CCCC']), + # nc.RestrictBasesFilter(bases=['A', 'T', 'C']), ] # def nupack_binding_energy_in_bounds(seq: str) -> bool: @@ -95,18 +95,18 @@ def main() -> None: # return -11 < energy < -9 # # # list of functions: - # sequence_constraints: List[SequenceConstraint] = [ + # sequence_filters: List[SequenceFilter] = [ # # nupack_binding_energy_in_bounds, # ] replace_with_close_sequences = True # replace_with_close_sequences = False domain_pool_10 = nc.DomainPool(f'length-10_domains', 10, - numpy_constraints=numpy_constraints, + numpy_filters=numpy_filters, replace_with_close_sequences=replace_with_close_sequences, ) domain_pool_11 = nc.DomainPool(f'length-11_domains', 11, - numpy_constraints=numpy_constraints, + numpy_filters=numpy_filters, 
replace_with_close_sequences=replace_with_close_sequences, ) diff --git a/examples/sample_designer.py b/examples/sample_designer.py index 814f167f..8f71acf4 100644 --- a/examples/sample_designer.py +++ b/examples/sample_designer.py @@ -88,18 +88,18 @@ def main() -> None: else: design = initial_design - numpy_constraints = [ - nc.NearestNeighborEnergyConstraint(-9.5, -9.0, 52.0), - nc.BaseCountConstraint(base='G', high_count=1), - nc.RunsOfBasesConstraint(['C', 'G'], 4), - nc.RunsOfBasesConstraint(['A', 'T'], 4), + numpy_filters = [ + nc.NearestNeighborEnergyFilter(-9.5, -9.0, 52.0), + nc.BaseCountFilter(base='G', high_count=1), + nc.RunsOfBasesFilter(['C', 'G'], 4), + nc.RunsOfBasesFilter(['A', 'T'], 4), ] lengths = [9, 10, 11, 12] domain_pools = { length: nc.DomainPool(f'length-{length} domains', length, - numpy_constraints=numpy_constraints) for length in lengths + numpy_filters=numpy_filters) for length in lengths } for strand in [strand0, strand1]: diff --git a/examples/seesaw_gate.py b/examples/seesaw_gate.py index 4bb2f10b..17c9d220 100644 --- a/examples/seesaw_gate.py +++ b/examples/seesaw_gate.py @@ -17,30 +17,30 @@ TOEHOLD_COMPLEMENT = f'{TOEHOLD_DOMAIN}{COMPLEMENT_SUFFIX}' # Domain pools -forbidden_substring_constraints = [ - nc.ForbiddenSubstringConstraint(['G' * 4, 'C' * 4]), +forbidden_substring_filters = [ + nc.ForbiddenSubstringFilter(['G' * 4, 'C' * 4]), ] -non_sub_long_domain_constraints = [ - nc.RestrictBasesConstraint(('A', 'C', 'T')), - *forbidden_substring_constraints +non_sub_long_domain_filters = [ + nc.RestrictBasesFilter(('A', 'C', 'T')), + *forbidden_substring_filters ] -sub_long_domain_constraints: List[nc.NumpyConstraint] = [ - nc.RestrictBasesConstraint(('A', 'C', 'T')), +sub_long_domain_filters: List[nc.NumpyFilter] = [ + nc.RestrictBasesFilter(('A', 'C', 'T')), ] if SUB_LONG_DOMAIN_LENGTH > 3: - sub_long_domain_constraints.extend(forbidden_substring_constraints) + sub_long_domain_filters.extend(forbidden_substring_filters) 
SUB_LONG_DOMAIN_POOL: nc.DomainPool = nc.DomainPool('sub_long_domain_pool', SUB_LONG_DOMAIN_LENGTH, - numpy_constraints=sub_long_domain_constraints) + numpy_filters=sub_long_domain_filters) NON_SUB_LONG_DOMAIN_POOL: nc.DomainPool = nc.DomainPool('non_sub_long_domain_pool', NON_SUB_LONG_DOMAIN_LENGTH, - numpy_constraints=non_sub_long_domain_constraints) + numpy_filters=non_sub_long_domain_filters) toehold_domain_contraints = [ - nc.ForbiddenSubstringConstraint('G' * 4), - nc.ForbiddenSubstringConstraint('C' * 4) + nc.ForbiddenSubstringFilter('G' * 4), + nc.ForbiddenSubstringFilter('C' * 4) ] TOEHOLD_DOMAIN_POOL: nc.DomainPool = nc.DomainPool('toehold_domain_pool', 5) @@ -383,11 +383,11 @@ def reporter_base_strand(gate) -> nc.Strand: strand_complexes=[f_waste_6_complex]) -def four_g_constraint_evaluate(seqs: Tuple[str, ...], strand: Optional[nc.Strand]) -> Tuple[float, str]: +def four_g_constraint_evaluate(seqs: Tuple[str, ...], strand: Optional[nc.Strand]) -> nc.Result: seq = seqs[0] - score = 1000 if 'GGGG' in seq else 0 + excess = 1000 if 'GGGG' in seq else 0 violation_str = "" if 'GGGG' not in strand.sequence() else "** violation**" - return score, f"{strand.name}: {strand.sequence()}{violation_str}" + return nc.Result(excess=excess, summary=f"{strand.name}: {strand.sequence()}{violation_str}") def four_g_constraint_summary(strand: nc.Strand): @@ -408,8 +408,8 @@ def four_g_constraint_summary(strand: nc.Strand): waste_2_5_complex_constraint, reporter_6_complex_constraint, f_waste_6_complex_constraint, + four_g_constraint, ] -constraints.append(four_g_constraint) seesaw_design = nc.Design(strands=strands) diff --git a/examples/square_root_circuit.py b/examples/square_root_circuit.py index 55a74f83..c5c5136f 100644 --- a/examples/square_root_circuit.py +++ b/examples/square_root_circuit.py @@ -25,23 +25,23 @@ ILLEGAL_SUBSTRINGS = ILLEGAL_SUBSTRINGS_FOUR + ILLEGAL_SUBSTRINGS_FIVE # NumpyConstraints -three_letter_code_constraint = nc.RestrictBasesConstraint(('A', 
'C', 'T')) -no_gggg_constraint = nc.ForbiddenSubstringConstraint(ILLEGAL_SUBSTRINGS_FOUR) -no_aaaaa_constraint = nc.ForbiddenSubstringConstraint(ILLEGAL_SUBSTRINGS_FIVE) -c_content_constraint = nc.BaseCountConstraint('C', floor(0.7 * SIGNAL_DOMAIN_LENGTH), - ceil(0.3 * SIGNAL_DOMAIN_LENGTH)) +three_letter_code_constraint = nc.RestrictBasesFilter(('A', 'C', 'T')) +no_gggg_constraint = nc.ForbiddenSubstringFilter(ILLEGAL_SUBSTRINGS_FOUR) +no_aaaaa_constraint = nc.ForbiddenSubstringFilter(ILLEGAL_SUBSTRINGS_FIVE) +c_content_constraint = nc.BaseCountFilter('C', floor(0.7 * SIGNAL_DOMAIN_LENGTH), + ceil(0.3 * SIGNAL_DOMAIN_LENGTH)) # Domain pools SUBDOMAIN_SS_POOL: nc.DomainPool = nc.DomainPool(f'SUBDOMAIN_SS_POOL', SIGNAL_DOMAIN_LENGTH - EXTENDED_TOEHOLD_LENGTH) SUBDOMAIN_S_POOL: nc.DomainPool = nc.DomainPool(f'SUBDOMAIN_S_POOL', EXTENDED_TOEHOLD_LENGTH) TOEHOLD_DOMAIN_POOL: nc.DomainPool = nc.DomainPool( - name='TOEHOLD_DOMAIN_POOL', length=TOEHOLD_LENGTH, numpy_constraints=[three_letter_code_constraint]) + name='TOEHOLD_DOMAIN_POOL', length=TOEHOLD_LENGTH, numpy_filters=[three_letter_code_constraint]) SIGNAL_DOMAIN_POOL: nc.DomainPool = nc.DomainPool( name='SIGNAL_DOMAIN_POOL', length=SIGNAL_DOMAIN_LENGTH, - numpy_constraints=[three_letter_code_constraint, c_content_constraint, no_aaaaa_constraint, - no_gggg_constraint]) + numpy_filters=[three_letter_code_constraint, c_content_constraint, no_aaaaa_constraint, + no_gggg_constraint]) # Alias dc_complex_constraint = nc.nupack_complex_base_pair_probability_constraint @@ -365,7 +365,7 @@ def base_difference_constraint(domains: Iterable[nc.Domain]) -> nc.DomainPairCon """ def evaluate(seqs: Tuple[str, ...], domain_pair: Optional[nc.DomainPair]) \ - -> Tuple[float, str]: + -> nc.Result: seq1, seq2 = seqs if domain_pair is not None: domain1, domain2 = domain_pair.domain1, domain_pair.domain2 @@ -401,7 +401,7 @@ def evaluate(seqs: Tuple[str, ...], domain_pair: Optional[nc.DomainPair]) \ f'\t{domain1}: {domain1.sequence}\n' 
f'\t{domain2}: {domain2.sequence}\n') - return result, summary + return nc.Result(excess=result, summary=summary) pairs = itertools.combinations(domains, 2) @@ -430,7 +430,7 @@ def violated(seq: str): return True return False - def evaluate(seqs: Tuple[str, ...], strand: Optional[nc.Strand]) -> Tuple[float, str]: + def evaluate(seqs: Tuple[str, ...], strand: Optional[nc.Strand]) -> nc.Result: seq = seqs[0] if violated(seq): violation_str = '** violation**' @@ -438,7 +438,7 @@ def evaluate(seqs: Tuple[str, ...], strand: Optional[nc.Strand]) -> Tuple[float, else: violation_str = '' score = 0 - return score, f"{strand.name}: {strand.sequence()}{violation_str}" + return nc.Result(excess=score, summary=f"{strand.name}: {strand.sequence()}{violation_str}") return nc.StrandConstraint(description="Strand Substring Constraint", short_description="Strand Substring Constraint", diff --git a/examples/sst_canvas.py b/examples/sst_canvas.py index 1154b049..d94fc877 100644 --- a/examples/sst_canvas.py +++ b/examples/sst_canvas.py @@ -123,14 +123,14 @@ def create_design(width: int, height: int) -> nc.Design: :return: design with `width` x `height` canvas of SSTs """ - numpy_constraints = [ - nc.NearestNeighborEnergyConstraint(-9.3, -9.0, 52.0), # energies should all be "close" - nc.RunsOfBasesConstraint(['C', 'G'], 4), # forbid substrings of form {C,G}^4 - nc.ForbiddenSubstringConstraint(['AAAAA', 'TTTTT']), # forbid 5 A's in a row or 5 T's in a row + numpy_filters = [ + nc.NearestNeighborEnergyFilter(-9.3, -9.0, 52.0), # energies should all be "close" + nc.RunsOfBasesFilter(['C', 'G'], 4), # forbid substrings of form {C,G}^4 + nc.ForbiddenSubstringFilter(['AAAAA', 'TTTTT']), # forbid 5 A's in a row or 5 T's in a row ] - domain_pool_10 = nc.DomainPool(f'length-10_domains', 10, numpy_constraints=numpy_constraints) - domain_pool_11 = nc.DomainPool(f'length-11_domains', 11, numpy_constraints=numpy_constraints) + domain_pool_10 = nc.DomainPool(f'length-10_domains', 10, 
numpy_filters=numpy_filters) + domain_pool_11 = nc.DomainPool(f'length-11_domains', 11, numpy_filters=numpy_filters) design = nc.Design() diff --git a/nuad/__version__.py b/nuad/__version__.py index 2deeb7fa..fe38fe1f 100644 --- a/nuad/__version__.py +++ b/nuad/__version__.py @@ -1 +1 @@ -version = '0.3.5' # version line; WARNING: do not remove or change this line or comment +version = '0.4.0' # version line; WARNING: do not remove or change this line or comment diff --git a/nuad/constraints.py b/nuad/constraints.py index 4ea052cc..c0a1f423 100644 --- a/nuad/constraints.py +++ b/nuad/constraints.py @@ -8,8 +8,8 @@ Also important are two other types of constraints (not subclasses of :any:`Constraint`), which are used prior to the search to determine if it is even -legal to use a DNA sequence: subclasses of the abstract base class :any:`NumpyConstraint`, -and :any:`SequenceConstraint`, an alias for a function taking a string as input and returning a bool. +legal to use a DNA sequence: subclasses of the abstract base class :any:`NumpyFilter`, +and :any:`SequenceFilter`, an alias for a function taking a string as input and returning a bool. See the README on the GitHub page for more detailed explaination of these classes: https://github.com/UC-Davis-molecular-computing/dsd#data-model @@ -351,58 +351,71 @@ def all_pairs_iterator(values: Iterable[T], return it -SequenceConstraint = Callable[[str], bool] +SequenceFilter = Callable[[str], bool] """ -Constraint that applies to a DNA sequence; the difference between this an a :any:`DomainConstraint` is +Filter (see description of :any:`NumpyFilter` for explanation of the term "filter") +that applies to a DNA sequence; the difference between this and a :any:`DomainConstraint` is that these are applied before a sequence is assigned to a :any:`Domain`, so the constraint can only be based on the DNA sequence, and not, for instance, on the :any:`Domain`'s :any:`DomainPool`. 
-Consequently :any:`SequenceConstraint`'s, like :any:`NumpyConstraint`'s, are treated differently than -subtypes of :any:`Constraint`, since a DNA sequence failing any :any:`SequenceConstraint`'s or -:any:`NumpyConstraint`'s is never allowed to be assigned into any :any:`Domain`. +Consequently :any:`SequenceFilter`'s, like :any:`NumpyFilter`'s, are treated differently than +subtypes of :any:`Constraint`, since a DNA sequence failing any :any:`SequenceFilter`'s or +:any:`NumpyFilter`'s is never allowed to be assigned into any :any:`Domain`. -The difference with :any:`NumpyConstraint` is that a :any:`NumpyConstraint` requires one to express the +The difference with :any:`NumpyFilter` is that a :any:`NumpyFilter` requires one to express the constraint in a way that is efficient for the linear algebra operations of numpy. If you cannot figure out -how to do this, a :any:`SequenceConstraint` can be expressed in pure Python, but typically will be much -slower to apply than a :any:`NumpyConstraint`. +how to do this, a :any:`SequenceFilter` can be expressed in pure Python, but typically will be much +slower to apply than a :any:`NumpyFilter`. """ # The Mypy error being ignored is a bug and is described here: # https://github.com/python/mypy/issues/5374#issuecomment-650656381 @dataclass # type: ignore -class NumpyConstraint(ABC): +class NumpyFilter(ABC): """ - Abstract base class for numpy constraints. These are constraints that can be efficiently encoded + Abstract base class for numpy filters. A "filter" is a hard constraint applied to sequences + for a :any:`Domain`; a sequence not passing the filter is never allowed to be assigned to + a :any:`Domain`. This contrasts with the various subclasses of :any:`Constraint`, which + are different in two ways: 1) they can apply to larger parts of the design than just a domain, + e.g., a :any:`Strand` or a pair of :any:`Domain`'s, and 2) they are "soft" constraints that are + allowed to be violated during the course of the search. 
+ + A :any:`NumpyFilter` is one that can be efficiently encoded as numpy operations on 2D arrays of bytes representing DNA sequences, through the class :any:`np.DNASeqList` (which uses such a 2D array as the field :py:data:`np.DNASeqList.seqarr`). - Subclasses should set the value self.name, inherited from this class. + Subclasses should set the value :data:`NumpyFilter.name`, inherited from this class. - Pre-made subclasses of :any:`NumpyConstraint` provided in this library, - such as :any:`RestrictBasesConstraint` or :any:`NearestNeighborEnergyConstraint`, + Pre-made subclasses of :any:`NumpyFilter` provided in this library, + such as :any:`RestrictBasesFilter` or :any:`NearestNeighborEnergyFilter`, are dataclasses (https://docs.python.org/3/library/dataclasses.html). - There is no requirement that your custom subclasses be dataclasses, but since the subclasses will - inherit the field :py:data:`NumpyConstraint.name`, you can easily make them dataclasses to get, - for example, free ``repr`` and ``str`` implementations. See the source code for the example subclasses. + There is no requirement that custom subclasses be dataclasses, but since the subclasses will + inherit the field :py:data:`NumpyFilter.name`, you can easily make them dataclasses to get, + for example, free ``repr`` and ``str`` implementations. See the source code for examples. + + The related type :any:`SequenceFilter` (which is just an alias for a Python function with + a certain signature) has a similar purpose, but is used for filters that cannot be encoded + as numpy operations. Since they are applied by running a Python loop, they are much slower + to evaluate than a :any:`NumpyFilter`. 
""" - name: str = field(init=False, default='TODO: give a concrete name to this NumpyConstraint') - """Name of this :any:`NumpyConstraint`.""" + name: str = field(init=False, default='TODO: give a concrete name to this NumpyFilter') + """Name of this :any:`NumpyFilter`.""" @abstractmethod def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: """ Subclasses should override this method. - Since these are constraints that use numpy, generally they will access the numpy ndarray instance + Since these are filters that use numpy, generally they will access the numpy ndarray instance `seqs.seqarr`, operate on it, and then create a new :any:`np.DNASeqList` instance via the constructor :any:`np.DNASeqList` taking an numpy ndarray as input. See the source code of included constraints for examples, such as - :py:meth:`NearestNeighborEnergyConstraint.remove_violating_sequences` + :py:meth:`NearestNeighborEnergyFilter.remove_violating_sequences` or - :py:meth:`BaseCountConstraint.remove_violating_sequences`. + :py:meth:`BaseCountFilter.remove_violating_sequences`. These are usually quite tricky to write, requiring one to think in terms of linear algebra operations. The code tends not to be easy to read. But when a constraint can be expressed in this way, it is typically *very* fast to apply; many millions of sequences can @@ -414,11 +427,11 @@ def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: a new :any:`np.DNASeqList` object representing the DNA sequences in `seqs` that satisfy the constraint """ - pass + raise NotImplementedError() @dataclass -class RestrictBasesConstraint(NumpyConstraint): +class RestrictBasesFilter(NumpyFilter): """ Restricts the sequence to use only a subset of bases. 
This can be used to implement a so-called "three-letter code", for instance, in which a certain subset of :any:`Strand` uses only the @@ -431,9 +444,9 @@ class RestrictBasesConstraint(NumpyConstraint): https://science.sciencemag.org/content/332/6034/1196, http://www.qianlab.caltech.edu/seesaw_digital_circuits2011_SI.pdf - Note, however, that this is a constraint :any:`Domain`'s, not :any:`Strand`'s, so for a three-letter - code to work, you must take care not to mixed :any:`Domain`'s on a :any:`Strand` that will use - different alphabets. + Note, however, that this is a filter for :any:`Domain`'s, not whole :any:`Strand`'s, + so for a three-letter code to work, you must take care not to mix :any:`Domain`'s on a + :any:`Strand` that will use different alphabets. """ # noqa bases: Collection[str] @@ -454,7 +467,7 @@ def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: @dataclass -class NearestNeighborEnergyConstraint(NumpyConstraint): +class NearestNeighborEnergyFilter(NumpyFilter): """ This constraint calculates the nearest-neighbor binding energy of a domain with its perfect complement (summing over all length-2 substrings of the domain's sequence), @@ -462,8 +475,8 @@ class NearestNeighborEnergyConstraint(NumpyConstraint): (https://www.annualreviews.org/doi/abs/10.1146/annurev.biophys.32.110601.141800, see Table 1, and example on page 419). It rejects any sequences whose energy according to this sum is outside the range - [:py:data:`NearestNeighborEnergyConstraint.low_energy`, - :py:data:`NearestNeighborEnergyConstraint.high_energy`]. + [:py:data:`NearestNeighborEnergyFilter.low_energy`, + :py:data:`NearestNeighborEnergyFilter.high_energy`]. """ low_energy: float @@ -490,7 +503,7 @@ def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: @dataclass -class BaseCountConstraint(NumpyConstraint): +class BaseCountFilter(NumpyFilter): """ Restricts the sequence to contain a certain number of occurences of a given base. 
""" @@ -500,12 +513,12 @@ class BaseCountConstraint(NumpyConstraint): high_count: int | None = None """ - Count of :py:data:`BaseCountConstraint.base` must be at most :py:data:`BaseCountConstraint.high_count`. + Count of :py:data:`BaseCountFilter.base` must be at most :py:data:`BaseCountFilter.high_count`. """ low_count: int | None = None """ - Count of :py:data:`BaseCountConstraint.base` must be at least :py:data:`BaseCountConstraint.low_count`. + Count of :py:data:`BaseCountFilter.base` must be at least :py:data:`BaseCountFilter.low_count`. """ def __post_init__(self) -> None: @@ -524,10 +537,10 @@ def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: @dataclass -class BaseEndConstraint(NumpyConstraint): +class BaseEndFilter(NumpyFilter): """ Restricts the sequence to contain only certain bases on - (or near, if :py:data:`BaseEndConstraint.distance` > 0) each end. + (or near, if :py:data:`BaseEndFilter.distance` > 0) each end. """ bases: Collection[str] @@ -591,7 +604,7 @@ def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: @dataclass -class BaseAtPositionConstraint(NumpyConstraint): +class BaseAtPositionFilter(NumpyFilter): """ Restricts the sequence to contain only certain base(s) on at a particular position. @@ -634,7 +647,7 @@ def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: @dataclass -class ForbiddenSubstringConstraint(NumpyConstraint): +class ForbiddenSubstringFilter(NumpyFilter): """ Restricts the sequence not to contain a certain substring(s), e.g., GGGG. """ @@ -649,7 +662,7 @@ class ForbiddenSubstringConstraint(NumpyConstraint): indices: Sequence[int] | None = None """ - Indices at which to check for each substring in :data:`ForbiddenSubstringConstraint.substrings`. + Indices at which to check for each substring in :data:`ForbiddenSubstringFilter.substrings`. If not specified, all appropriate indices are checked. 
""" @@ -686,7 +699,7 @@ def length(self) -> int: return len(first_substring) def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: - """Remove sequences that have a string in :py:data:`ForbiddenSubstringConstraint.substrings` + """Remove sequences that have a string in :py:data:`ForbiddenSubstringFilter.substrings` as a substring.""" assert isinstance(self.substrings, list) sub_len = len(self.substrings[0]) @@ -704,14 +717,14 @@ def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: @dataclass -class RunsOfBasesConstraint(NumpyConstraint): +class RunsOfBasesFilter(NumpyFilter): """ Restricts the sequence not to contain runs of a certain length from a certain subset of bases, (e.g., forbidding any substring in {C,G}^3; no four bases can appear in a row that are either C or G) This works by simply generating all strings representing the runs of bases, - and then using a :any:`ForbiddenSubstringConstraint` with those strings. So this will not be efficient + and then using a :any:`ForbiddenSubstringFilter` with those strings. So this will not be efficient for forbidding, for example {A,C,T}^20 (i.e., all runs of A's, C's, or T's of length 20), which would generate all 3^20 = 3,486,784,401 strings of length 20 from the alphabet {A,C,T}^20. Hopefully such a constraint would not be used in practice. @@ -719,7 +732,7 @@ class RunsOfBasesConstraint(NumpyConstraint): bases: Collection[str] """ - Bases to forbid in runs of length :py:data:`RunsOfBasesConstraint.length`. + Bases to forbid in runs of length :py:data:`RunsOfBasesFilter.length`. """ length: int @@ -742,17 +755,17 @@ def __init__(self, bases: str | Collection[str], length: int) -> None: raise ValueError(f'length must be positive, but it is {self.length}') if self.length == 1: allowed_bases = all_dna_bases - set(self.bases) - logger.warning('You have specified a RunsOfBasesConstraint with length = 1. 
' + logger.warning('You have specified a RunsOfBasesFilter with length = 1. ' 'Although this will work, it essentially says to forbid using any of the bases ' f'in {set(self.bases)}, i.e., only use bases in {allowed_bases}. ' f'It is more efficient to use the constraint ' - f'RestrictBasesConstraint({allowed_bases}).') + f'RestrictBasesFilter({allowed_bases}).') def remove_violating_sequences(self, seqs: nn.DNASeqList) -> nn.DNASeqList: """Remove sequences that have a run of given length of bases from given bases.""" substrings = list( map(lambda lst: ''.join(lst), itertools.product(self.bases, repeat=self.length))) - constraint = ForbiddenSubstringConstraint(substrings) + constraint = ForbiddenSubstringFilter(substrings) return constraint.remove_violating_sequences(seqs) @@ -926,7 +939,7 @@ class DomainPool(JSONSerializable): This is an explicit list of sequences to consider for :any:`Domain`'s using this :any:`DomainPool`. During the search, if a domain with this :any:`DomainPool` is picked to have its sequence changed, then a sequence will be picked uniformly at random from this list. Note that no - :any:`NumpyConstraint`'s or :any:`SequenceConstraint`'s will be applied. + :any:`NumpyFilter`'s or :any:`SequenceFilter`'s will be applied. Alternatively, the field can be an instance of :any:`SubstringSampler` for the common case that the set of possible sequences is simple all (or many) substrings of a single longer sequence. @@ -950,31 +963,31 @@ class DomainPool(JSONSerializable): number of bases different from the previous sequence (Hamming distance). """ - numpy_constraints: List[NumpyConstraint] = field( + numpy_filters: List[NumpyFilter] = field( compare=False, hash=False, default_factory=list, repr=False) """ - :any:`NumpyConstraint`'s shared by all :any:`Domain`'s in this :any:`DomainPool`. + :any:`NumpyFilter`'s shared by all :any:`Domain`'s in this :any:`DomainPool`. 
This is used to choose potential sequences to assign to the :any:`Domain`'s in this :any:`DomainPool` in the method :py:meth:`DomainPool.generate_sequence`. - The difference with :py:data:`DomainPool.sequence_constraints` is that these constraints can be applied + The difference with :py:data:`DomainPool.sequence_filters` is that these constraints can be applied efficiently to many sequences at once, represented as a numpy 2D array of bytes (via the class :any:`np.DNASeqList`), so they are done in large batches in advance. - In contrast, the constraints in :py:data:`DomainPool.sequence_constraints` are done on Python strings + In contrast, the constraints in :py:data:`DomainPool.sequence_filters` are done on Python strings representing DNA sequences, and they are called one at a time when a new sequence is requested in :py:meth:`DomainPool.generate_sequence`. Optional; default is empty. """ - sequence_constraints: List[SequenceConstraint] = field( + sequence_filters: List[SequenceFilter] = field( compare=False, hash=False, default_factory=list, repr=False) """ - :any:`SequenceConstraint`'s shared by all :any:`Domain`'s in this :any:`DomainPool`. + :any:`SequenceFilter`'s shared by all :any:`Domain`'s in this :any:`DomainPool`. This is used to choose potential sequences to assign to the :any:`Domain`'s in this :any:`DomainPool` in the method :py:meth:`DomainPool.generate`. - See :py:data:`DomainPool.numpy_constraints` for an explanation of the difference between them. + See :py:data:`DomainPool.numpy_filters` for an explanation of the difference between them. See :py:data:`DomainPool.domain_constraints` for an explanation of the difference between them. 
@@ -1000,11 +1013,11 @@ def __post_init__(self) -> None: f'and sequence "{seq}", index {idx} in the list possible_sequences,\n' f'has length {len(seq)}.') - if len(self.numpy_constraints) > 0: - raise ValueError('If possible_sequences is specified, then numpy_constraints should ' + if len(self.numpy_filters) > 0: + raise ValueError('If possible_sequences is specified, then numpy_filters should ' 'not be specified.') - if len(self.sequence_constraints) > 0: - raise ValueError('If possible_sequences is specified, then sequence_constraints should ' + if len(self.sequence_filters) > 0: + raise ValueError('If possible_sequences is specified, then sequence_filters should ' 'not be specified.') if self.length is not None: @@ -1025,31 +1038,31 @@ def __post_init__(self) -> None: self.hamming_probability[length] /= total idx = 0 - for numpy_constraint in self.numpy_constraints: - if not isinstance(numpy_constraint, NumpyConstraint): - raise ValueError('each element of numpy_constraints must be an instance of ' - 'NumpyConstraint, ' + for numpy_constraint in self.numpy_filters: + if not isinstance(numpy_constraint, NumpyFilter): + raise ValueError('each element of numpy_filters must be an instance of ' + 'NumpyFilter, ' f'but the element at index {idx} is of type {type(numpy_constraint)}') - elif isinstance(numpy_constraint, RunsOfBasesConstraint): + elif isinstance(numpy_constraint, RunsOfBasesFilter): if numpy_constraint.length > self.length: raise ValueError(f'DomainPool "{self.name}" has length {self.length}, but a ' - f'RunsOfBasesConstraint was specified with larger length ' + f'RunsOfBasesFilter was specified with larger length ' f'{numpy_constraint.length}, which is not allowed') - elif isinstance(numpy_constraint, ForbiddenSubstringConstraint): + elif isinstance(numpy_constraint, ForbiddenSubstringFilter): if numpy_constraint.length() > self.length: raise ValueError(f'DomainPool "{self.name}" has length {self.length}, but a ' - f'ForbiddenSubstringConstraint was 
specified with larger length ' + f'ForbiddenSubstringFilter was specified with larger length ' f'{numpy_constraint.length()}, which is not allowed') idx += 1 idx = 0 - for seq_constraint in self.sequence_constraints: - # SequenceConstraint is an alias for Callable[[str], float], + for seq_constraint in self.sequence_filters: + # SequenceFilter is an alias for Callable[[str], float], # which is not checkable using isinstance # https://stackoverflow.com/questions/624926/how-do-i-detect-whether-a-python-variable-is-a-function if not callable(seq_constraint): - raise ValueError('each element of numpy_constraints must be an instance of ' - 'SequenceConstraint (i.e., be a function that takes a single string ' + raise ValueError('each element of numpy_filters must be an instance of ' + 'SequenceFilter (i.e., be a function that takes a single string ' 'and returns a bool), ' f'but the element at index {idx} is of type {type(seq_constraint)}') idx += 1 @@ -1116,7 +1129,7 @@ def from_json_serializable(json_map: Dict[str, Any]) -> DomainPool: def _first_sequence_satisfying_sequence_constraints(self, seqs: nn.DNASeqList) -> str | None: if len(seqs) == 0: return None - if len(self.sequence_constraints) == 0: + if len(self.sequence_filters) == 0: return seqs.get_seq_str(0) for idx in range(seqs.numseqs): seq = seqs.get_seq_str(idx) @@ -1129,18 +1142,18 @@ def satisfies_sequence_constraints(self, sequence: str) -> bool: :param sequence: DNA sequence to check :return: - whether `sequence` satisfies all constraints in :data:`DomainPool.sequence_constraints` + whether `sequence` satisfies all constraints in :data:`DomainPool.sequence_filters` """ - return all(constraint(sequence) for constraint in self.sequence_constraints) + return all(constraint(sequence) for constraint in self.sequence_filters) def generate_sequence(self, rng: np.random.Generator, previous_sequence: str | None = None) -> str: """ - Returns a DNA sequence of given length satisfying 
:py:data:`DomainPool.numpy_constraints` and - :py:data:`DomainPool.sequence_constraints` + Returns a DNA sequence of given length satisfying :py:data:`DomainPool.numpy_filters` and + :py:data:`DomainPool.sequence_filters` **Note:** By default, there is no check that the sequence returned is unequal to one already - assigned somewhere in the design, since both :py:data:`DomainPool.numpy_constraints` and - :data:`DomainPool.sequence_constraints` do not have access to the whole :any:`Design`. + assigned somewhere in the design, since both :py:data:`DomainPool.numpy_filters` and + :data:`DomainPool.sequence_filters` do not have access to the whole :any:`Design`. But the :any:`DomainPairConstraint` returned by :meth:`domains_not_substrings_of_each_other_constraint` can be used to specify this :any:`Design`-wide constraint. @@ -1159,8 +1172,8 @@ def generate_sequence(self, rng: np.random.Generator, previous_sequence: str | N picking a Hamming distance from :data:`DomainPool.hamming_probability` with weighted probabilities of choosing each distance. 
:return: - DNA sequence of given length satisfying :py:data:`DomainPool.numpy_constraints` and - :py:data:`DomainPool.sequence_constraints` + DNA sequence of given length satisfying :py:data:`DomainPool.numpy_filters` and + :py:data:`DomainPool.sequence_filters` """ if self.possible_sequences is not None: if isinstance(self.possible_sequences, list): @@ -1227,10 +1240,10 @@ def _sample_hamming_distance_from_sequence(self, previous_sequence: str, rng: np shuffle=True, num_random_seqs=num_to_generate, rng=rng) generated_all_seqs = False - seqs_satisfying_numpy_constraints = self._filter_numpy_constraints(seqs) - self._log_numpy_generation(length, num_to_generate, len(seqs_satisfying_numpy_constraints)) + seqs_satisfying_numpy_filters = self._apply_numpy_filters(seqs) + self._log_numpy_generation(length, num_to_generate, len(seqs_satisfying_numpy_filters)) sequence = self._first_sequence_satisfying_sequence_constraints( - seqs_satisfying_numpy_constraints) + seqs_satisfying_numpy_filters) if sequence is not None: return sequence @@ -1270,9 +1283,9 @@ def _get_next_sequence_satisfying_numpy_and_sequence_constraints(self, rng: np.r if num_to_generate >= num_sequences_total / 2: num_to_generate = num_sequences_total - seqs_satisfying_numpy_constraints = \ - self._generate_random_sequences_satisfying_numpy_constraints(rng, num_to_generate) - sequence = self._first_sequence_satisfying_sequence_constraints(seqs_satisfying_numpy_constraints) + seqs_satisfying_numpy_filters = \ + self._generate_random_sequences_passing_numpy_filters(rng, num_to_generate) + sequence = self._first_sequence_satisfying_sequence_constraints(seqs_satisfying_numpy_filters) if sequence is not None: return sequence @@ -1293,16 +1306,16 @@ def _get_next_sequence_satisfying_numpy_and_sequence_constraints(self, rng: np.r raise AssertionError('should be unreachable') - def _generate_random_sequences_satisfying_numpy_constraints(self, rng: np.random.Generator, - num_to_generate: int) -> nn.DNASeqList: + 
def _generate_random_sequences_passing_numpy_filters(self, rng: np.random.Generator, + num_to_generate: int) -> nn.DNASeqList: bases = self._bases_to_use() length = self.length _length_threshold_numpy = math.floor(math.log(num_to_generate, 4)) seqs = nn.DNASeqList(length=length, alphabet=bases, shuffle=True, num_random_seqs=num_to_generate, rng=rng) - seqs_satisfying_numpy_constraints = self._filter_numpy_constraints(seqs) - self._log_numpy_generation(length, num_to_generate, len(seqs_satisfying_numpy_constraints)) - return seqs_satisfying_numpy_constraints + seqs_passing_numpy_filters = self._apply_numpy_filters(seqs) + self._log_numpy_generation(length, num_to_generate, len(seqs_passing_numpy_filters)) + return seqs_passing_numpy_filters @staticmethod def _log_numpy_generation(length: int, num_to_generate: int, num_passed: int): @@ -1314,20 +1327,20 @@ def _log_numpy_generation(length: int, num_to_generate: int, num_passed: int): f'passed the numpy sequence constraints') def _bases_to_use(self) -> Collection[str]: - # checks explicitly for RestrictBasesConstraint - for constraint in self.numpy_constraints: - if isinstance(constraint, RestrictBasesConstraint): - return constraint.bases + # checks explicitly for RestrictBasesFilter + for filter in self.numpy_filters: + if isinstance(filter, RestrictBasesFilter): + return filter.bases return 'A', 'C', 'G', 'T' - def _filter_numpy_constraints(self, seqs: nn.DNASeqList) -> nn.DNASeqList: - # filter sequence not passing numpy constraints, but skip NumpyRestrictBasesConstraint since + def _apply_numpy_filters(self, seqs: nn.DNASeqList) -> nn.DNASeqList: + # filter sequence not passing numpy filters, but skip RestrictBasesFilter since # that is more efficiently handled by the DNASeqList constructor to generate the sequences # in the first place - for constraint in self.numpy_constraints: - if isinstance(constraint, RestrictBasesConstraint): + for filter in self.numpy_filters: + if isinstance(filter, RestrictBasesFilter): 
continue - seqs = constraint.remove_violating_sequences(seqs) + seqs = filter.remove_violating_sequences(seqs) return seqs @@ -2566,9 +2579,14 @@ def from_json_serializable(json_map: Dict[str, Any], label_json = json_map.get(label_key) label = label_decoder(label_json) + idt_json = json_map.get(idt_key) + idt = None + if idt_json is not None: + idt = IDTFields.from_json_serializable(idt_json) + strand: Strand[StrandLabel, DomainLabel] = Strand( domains=domains, starred_domain_indices=starred_domain_indices, - group=group, name=name, label=label) + group=group, name=name, label=label, idt=idt) return strand def __repr__(self) -> str: @@ -3457,7 +3475,9 @@ def to_idt_bulk_input_format(self, sc_design = _export_dummy_scadnano_design_for_idt_export(strands) return sc_design.to_idt_bulk_input_format(delimiter, key, warn_duplicate_name, only_strands_with_idt) - def write_idt_bulk_input_file(self, *, directory: str = '.', filename: str = None, + def write_idt_bulk_input_file(self, *, + filename: str = None, + directory: str = '.', key: KeyFunction[Strand] | None = None, extension: str | None = None, delimiter: str = ',', @@ -3476,11 +3496,11 @@ def write_idt_bulk_input_file(self, *, directory: str = '.', filename: str = Non The string written is that returned by :meth:`Design.to_idt_bulk_input_format`. + :param filename: + optional custom filename to use (instead of currently running script) :param directory: specifies a directory in which to place the file, either absolute or relative to the current working directory. Default is the current working directory. - :param filename: - optional custom filename to use (instead of currently running script) :param key: `key function `_ used to determine order in which to output strand sequences. 
Some useful defaults are provided by @@ -3513,7 +3533,9 @@ def write_idt_bulk_input_file(self, *, directory: str = '.', filename: str = Non extension = 'idt' sc.write_file_same_name_as_running_python_script(contents, extension, directory, filename) - def write_idt_plate_excel_file(self, *, directory: str = '.', filename: str = None, + def write_idt_plate_excel_file(self, *, + filename: str = None, + directory: str = '.', key: KeyFunction[Strand] | None = None, warn_duplicate_name: bool = False, only_strands_with_idt: bool = False, @@ -3536,11 +3558,11 @@ def write_idt_plate_excel_file(self, *, directory: str = '.', filename: str = No that number of strands, because IDT charges extra for a plate with too few strands: https://www.idtdna.com/pages/products/custom-dna-rna/dna-oligos/custom-dna-oligos + :param filename: + custom filename if default (explained above) is not desired :param directory: specifies a directory in which to place the file, either absolute or relative to the current working directory. Default is the current working directory. - :param filename: - custom filename if default (explained above) is not desired :param key: `key function `_ used to determine order in which to output strand sequences. 
Some useful defaults are provided by @@ -3949,26 +3971,26 @@ def assign_modifications_to_scadnano_design(self, sc_design: sc.Design[StrandLab # filter out ignored strands sc_strands_to_include = [strand for strand in sc_design.strands if strand not in ignored_strands] - dsd_strands_by_name = {strand.name: strand for strand in self.strands} + nuad_strands_by_name = {strand.name: strand for strand in self.strands} for sc_strand in sc_strands_to_include: - dsd_strand: Strand = dsd_strands_by_name[sc_strand.name] - if dsd_strand.modification_5p is not None: + nuad_strand: Strand = nuad_strands_by_name[sc_strand.name] + if nuad_strand.modification_5p is not None: if sc_strand.modification_5p is not None and not overwrite: raise ValueError(f'Cannot assign 5\' modification from dsd strand to scadnano strand ' f'{sc_strand.name} because the scadnano strand already has a 5\'' f'modification assigned:\n{sc_strand.modification_5p}. ' f'Set overwrite to True to force an overwrite.') - sc_strand.modification_5p = dsd_strand.modification_5p.to_scadnano_modification() + sc_strand.modification_5p = nuad_strand.modification_5p.to_scadnano_modification() - if dsd_strand.modification_3p is not None: + if nuad_strand.modification_3p is not None: if sc_strand.modification_3p is not None and not overwrite: raise ValueError(f'Cannot assign 3\' modification from dsd strand to scadnano strand ' f'{sc_strand.name} because the scadnano strand already has a 3\'' f'modification assigned:\n{sc_strand.modification_3p}. 
' f'Set overwrite to True to force an overwrite.') - sc_strand.modification_3p = dsd_strand.modification_3p.to_scadnano_modification() + sc_strand.modification_3p = nuad_strand.modification_3p.to_scadnano_modification() - for offset, mod_int in dsd_strand.modifications_int.items(): + for offset, mod_int in nuad_strand.modifications_int.items(): if offset in sc_strand.modifications_int is not None and not overwrite: raise ValueError(f'Cannot assign internal modification from dsd strand to ' f'scadnano strand {sc_strand.name} at offset {offset} ' @@ -4164,7 +4186,7 @@ class Constraint(Generic[DesignPart], ABC): """ Abstract base class of all "soft" constraints to apply when running :meth:`search.search_for_dna_sequences`. - Unlike a :any:`NumpyConstraint` or a :any:`SequenceConstraint`, which disallow certain DNA sequences + Unlike a :any:`NumpyFilter` or a :any:`SequenceFilter`, which disallow certain DNA sequences from ever being assigned to a :any:`Domain`, a :any:`Constraint` can be violated during the search. The goal of the search is to reduce the number of violated :any:`Constraint`'s. See :meth:`search.search_for_dna_sequences` for a more detailed description of how the search algorithm diff --git a/nuad/search.py b/nuad/search.py index fb917e92..433f9776 100644 --- a/nuad/search.py +++ b/nuad/search.py @@ -860,8 +860,8 @@ def search_for_sequences(design: nc.Design, params: SearchParameters) -> None: Otherwise, if DNA sequences are already assigned to the :any:`Domain`'s initially, these sequences are used as a starting point for finding sequences that satisfy all :any:`Constraint`'s. 
- (In this case, those sequences are not checked against any :any:`NumpyConstraint`'s - or :any:`SequenceConstraint`'s in the :any:`Design`, since those checks are applied prior to + (In this case, those sequences are not checked against any :any:`NumpyFilter`'s + or :any:`SequenceFilter`'s in the :any:`Design`, since those checks are applied prior to assigning DNA sequences to any :any:`Domain`.) The function has some side effects. It writes a report on the optimal sequence assignment found so far diff --git a/tests/test.py b/tests/test.py index 2697b882..822b75e7 100644 --- a/tests/test.py +++ b/tests/test.py @@ -341,7 +341,7 @@ def test_write_idt_plate_excel_file(self) -> None: class TestNumpyConstraints(unittest.TestCase): def test_NearestNeighborEnergyConstraint_raises_exception_if_energies_in_wrong_order(self) -> None: with self.assertRaises(ValueError): - nc.NearestNeighborEnergyConstraint(-10, -15) + nc.NearestNeighborEnergyFilter(-10, -15) class TestInsertDomains(unittest.TestCase):