From 38a7b3cc53d6d65dbdfd65268ab07bd0951b446e Mon Sep 17 00:00:00 2001 From: Cameron Gilchrist Date: Wed, 4 Dec 2024 13:46:34 +0900 Subject: [PATCH] Squashed 'lib/foldseek/' changes from eec10926..33103374 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 33103374 Add parameter for taxonomy report in easy-search (#389) 3cad1360 Fix cluster reassign + tm-align #383 b43e63d7 Rework residue mapping to combine most gemmi AAs with prev FS AAs #387 8c3e3938 Merge pull request #385 from rachelse/master 7a3a9db3 Merge branch 'steineggerlab:master' into master 214886bb Update citations d295eec6 Replace rust with rustup in github CI 8485aaf9 Switch mac arm test to github hosted runner b011b8e4 lddt works when chaintm 0 d6056322 Fix convertalis for FS multimer d2d09b58 Merge pull request #330 from stromjm/interface_v2.0 4e514a2a Merge branch 'steineggerlab:master' into interface_v2.0 8daecacc except overlap e1f38a1e Merge pull request #362 from rachelse/master 6b00c5d9 Merge pull request #366 from rachelse/steineggerlab c18727e4 Deleted search-clust pipeline from README 3d85d5c4 minor d692f966 there could exist no match against itself e1f238df error a078fb9f typo in readme 24c8c92b treat monomer as singleton when scoremultimer uses --monomer-include-mode 1 1d5f9369 fix interface extraction exceptions c27a629a single chain alignment bug fixed 19c8820c Merge pull request #359 from steineggerlab/multimer 079a5a13 monomer related update done 06275df3 rollback to 43fd26f3d3e043c8f9fd4c2b193a8b68f8781689 4046f00f test rbh filter off 13e9883c Merge branch 'multimer' of https://github.com/steineggerlab/foldseek into multimer 2dadffc0 test rbh filter off 83cc643b update single chain cluster a17598cd build exceptions for interface mode 43fd26f3 remove tmscore threshold 704c3a82 fix chain cov ratio 1800e6a9 update for single chained alignments cd26d54c implement complex-tm-threshold 0b1fa423 typo c74a1a5f replace singlechain mode into monomer mode 34104148 merged steineggerlab/foldseek d414d908 merged steineggerlab/foldseek 232a4c43 Merge pull request #353 from rachelse/master 19595fd5 order in LocalParameters c5412cf7 Merge remote-tracking branch 'upstream/master' into interface_v2.0 c1a6b76c merged steineggerlab/foldseek 88093635 Merge pull request #354 from steineggerlab/test d267b3d8 bug fixed 7f2c6219 bug fix try4 abff375e bug fix try3 25e9629c bug fix try2 d9b2913b bug fix try1 8711f6b7 minor things 48666e18 typo in Readme 154019b6 Merge remote-tracking branch 'upstream/master' a2ec51d4 fix parameter explain 306ffb82 add parameter for single chained assignments b7c58acb Update regression 2c6b809d Update regression and convertalis ALNTMSCORE score 7e6be60a Revise alignment TMscore computation ab20120e scoremultimer 319144b4 minor 7112fccb minor e9b0f234 check alinged chain num when interfacelddt 2256e219 Merge pull request #345 from steineggerlab/foldseek_multiple_with_singlechain 9a2b7da4 bug fixed: single elemented vector for single chain alignment d99d79c1 single chain allowing multimersearch 52029c06 Added BFVD as a foldseek database (#344) af1e86e5 default parameters e1d15f64 alnLen seems much better 5b76247b minor ac0a32b1 minor 06016bdd minor fd87b160 added parameters in example in Readme 17402748 added coverage in Readme f1fc6a96 minor 1b39c482 big complex first to prevent big ones to be left and run alone for few hours using 1 thread 08b7e9ce previous version is twice faster de945b2b simd returns segfault 602ff37b Merge branch ‘steineggerlab:master’ into master be9fc339 Merge pull request #343 from steineggerlab/foldseek_multimer_bottleneck 3bcdabae checkChainRedundancy with unordered_sets 5a4ad0be foundNeighbors as an unordered_set b15e236a foldseek-multimer bottleneck solved b947f688 implement foundNeighbors 89f371bb DBSCAN as non-recursive function 9f603b29 check new scoremultimer 4f70b3f4 Readme 50c1df1b Readme 4f1592a4 Readme c3093cd3 outputs filtcov too 420038de chaging readme 3fd78777 chaging readme aa2ced33 all_seqs.fasta not working d1605fe7 SIMD for tmscore 72f5028c map to vector, complex to multimer, [TODO] check if speed improved 63bca7b2 remove distMap 73e41342 seq3di.clear() in GemmiWrapper 2758b96e Merge branch 'master' into interface_v2.0 9d74a1bf orders of Parameters 94874214 make mergeable 2b731d3c Updated regression 02fb1e58 Updated submodule 17986f4c code styles 6db582ea made filtermultimer to get one argument as output. 'output' and 'outout_info' will be the actual outputs a6bee293 important issue solved, thread_idx while writing 4bbdca4f minor aeacb68b changed way to buffer for ustring, tstring 05a80c58 filtcov.tsv to complex_filt_info file. createtsv query query complex_filt_info filtcov.tsv possible 51197117 minor 868bfb1c Update README.md 3ed737c0 Add --tmscore-threshold-mode to allow to switch normalization 552e18dc Fix convertalis and alignment normalization 6b77a4f6 code styles b40729c1 Fix issue https://github.com/steineggerlab/foldseek/issues/312 alntmscore is now normalized by the backtrace length 0415c37c minor issues edef0856 two db lines for each interface 9730f059 minor 2639127a Camel ce4528b1 Use MathUtil squaredist 69d397b2 consistency loss with multithreading solved a21576a0 mergeable, also only if at least 4 residues 0d82857c mergeable, only if at least 4 residues aefcffca Addition of interface code 6740f823 merged steineggerlab/foldseek 71b1f38f complex_h 56d3adbd monomer in scoremultimer aa30bec5 NogridInterface 928984bf Add BFMD database to repository bde99a74 Add ungappedprefilter to it. profile searches 16dc9150 complex-multimer DBSCAN earlystop with maxClusterNum 04876ca2 Merge commit '97d4c6cfb57bb7f0994015580579f31a18aaf9c5' 97d4c6cf Squashed 'lib/mmseqs/' changes from 804bb2af6d..ffb05619ca 0f6bb3cc Deleted original interfaceLDDT code file 22d24ffc Separated interface retrieving and saving 7b5e7287 Implemented interfaceLDDT but naive e478a324 Saved aligned coordinates into vector but cannot use SIMD operations 75013627 Sync with master branch c86d2ce3 solved complex_db_h for monomers 50208e9b Merged commit with review 3d26d2ee commit before pull 27756597 changed order of elements in struct and class for memory e35f355f setting default parameter collides with existing default values 543db3ad createtsv with --threads 1 to make complex_db_h in order a8f0a091 The mistake was not a big problem. One stage before putting iLDDT code 3fd0dab7 Corrected targetcomplexid mistake & chain number comparison 6a94924e Corrected mistake: Saved dbKey as target complex id so far.. 3123bae1 Removed redundant loops and improved performance cdf6e786 minor, input->query e44ea30e Made Complex struct and implementation is in progress 3922544e Inactivated filter-mode param: chainNum & conformation is affected 7e3e4764 Recovery point : saved previous iLDDT implementation b6943b8e Merge branch 'steineggerlab:master' into master 6494f8a6 minor bc212bc8 FoldseekBase.cpp update (#306) 3df6bc46 solved everything 7635ea3d minor a82587c6 minor c5f59d20 Merge branch 'steineggerlab:master' ee77f9d7 Merge 4604c238 complex to multimer 25812ffa Try moving to macos 11 in azure pipelines CI ebfdc666 Revert "Fix GCC 14 warnings" 044806f3 Fix GCC 14 warnings 59d2a253 Fix pymol mmcif files breaking gemmi (upstreamed here https://github.com/project-gemmi/gemmi/pull/325) e06bc508 octant 1da321c2 not done, but added vector check e8469df0 Update filtercomplex.cpp 5b10e67f Update filtercomplex.cpp cb0a43ec minor b31f2ada reset 8bc07703 reset cb277387 [MAYBE SOLVED} chainTM c411e323 DbKey to AlnId/DbId 01a39259 Check if no aligned chain exists 693d723d res.Len seems right 3855d2e8 Look at this. ChainTM goes higher than 1 a8f6588f simple 09b4e410 Solved Multithreading fe0c9383 [TODO] multithreading segfault e4abea41 Revert "maybe solved chain TM" 27f9ac86 maybe solved chain TM 0430e9e5 Calculate chain TM everytime a78bbd5d Set default param as set4final when computing chaintmscore e333ad48 Made few comments reviewing filtercomplex.cpp 8f2ab715 simplify building complex header cf28e076 parsing problem solved 55b5338c memcpy error solve 6f3ac2b6 parsing with pdb d02373d8 parsing 961b8cf0 removing extension a2ea4743 make filtcov.tsv not db 0bc9d97d minor f3a9c22b minor 36490d18 minor change 77936ab3 handling monomer & calculate chainTM if complexTM satisfied a0b426ef Merge branch 'steineggerlab:master' into master 79ad721c Solved weird chain TM-score behavior 81fbfd99 Implemented per-chain-tm but tmscore is suspicious e44034e6 Implemented realloc function in Coordinate.h 6ef9dc7b modified complex header 94da95c4 complex header make 5e47a2ac still 99251fd6 complexheader, but still issue exists 4e7c3624 Revised code of filter complex a27efde9 tmthreshold parameter 52f0459a TODO maybe TMthreshold c1294530 both tm for all cov modes 7367a247 assID, query, target, coverage(1 or 2), tm(1 or 2) ff5a8e5f filtercomplex tmp coverage.tsv 73c2aa7a Merge branch 'steineggerlab:master' into master 369842e2 Solved argument list too long issue 4329b254 Finalized rmdb 81ac5ef5 Generates comment about rep complex in fastafile b7a27454 easy-cc description b25de75f remove tmp files 4ceb9ada Completed to output rep seqs fasta file 560c6e49 temporary Result2repseq d16ac100 tmp remove 7e5bd089 changed tmp dir 86f5fe9e colsed easycc 412b51d0 header file fa875fdf making complex header file fe9865cb small changes 819a75a8 add description 06945aed Parameters 7d34bcbb default parameters e3defcd4 separated buildCmplDb from filtercomplex 83100b8e Solved complexsearch parameter not applied problem 607c14fc Success command run 9b183905 share status 7e054b6f [DONE] Build successed. [TODO] Default Parameter setting 6ac16224 finally make works c3a7c959 [TODO] Solve conflicts during make 8dd17b24 Organized shell scripts a36b8c7e git conflicts 6db40b57 tmp LocalParameters.cpp dd34d670 still build failed aebd3fd8 small changes b9d73150 .cpp files 02a89148 easycc and cc .sh dbd9b076 To Complexclusterworkflow f7b9508e Changes 03c635e5 Changed ComplexCluster into FilterComplex ec234b1d revised parameters for filtercomplex 39b2f062 renamed complexcluster.sh to filtercomplex.sh and finalized 47cfb386 share status 40a0e719 to share status 413faeeb Add filtercomplex parameter for coverage 8667be3d [TODO] Build failed. check localparameters, workflowfiles, etc. 3782b550 Made workflow file e15a2241 [IN PROGRESS] separated complexcluster and easycomplexcluster but need to organize 84c5279f FoldSeelBase.cpp should be changed though, easy-complexcluster output instruction 017ad0fe Updated LocalParameter files 83a46549 clustered results to flatfiles 38b60958 data/CMakeLists update 46611c48 CMakeLists update 51d29b8c Changed complexcluster.sh to easycomplexcluster.sh 69876616 Merge branch 'master' of https://github.com/rachelse/foldseek b5c45c37 minor modification ef00e785 [IN PROGRESS] Draft state complexcluster.sh 5ac175fd erased default -c 0.8 aaf1a6b1 complexclust.sh e52c527a cleaned code b7bc37fc -c default 0.8 d81811f9 TODO: select highest aligned alignments among same complex-complex & what if user wants to use -c 0.0? 5adeb999 no errors, not debugged yet 03860d19 has error, but for sharing status. Coverge criteria 47c37e08 Merge branch 'steineggerlab:master' into master 35c5914a Merge branch 'steineggerlab:master' into master bb7ec93b First version for complex filter git-subtree-dir: lib/foldseek git-subtree-split: 3310337471fc46880c245508af6a23adcb192cee --- .github/workflows/mac-arm64.yml | 8 +- README.md | 125 ++- azure-pipelines.yml | 8 +- data/CMakeLists.txt | 2 + data/easymultimercluster.sh | 163 ++++ data/easystructuresearch.sh | 6 + data/multimercluster.sh | 133 +++ data/structdatabases.sh | 17 + data/structureiterativesearch.sh | 10 +- lib/gemmi/mmcif.hpp | 7 +- lib/mmseqs/CMakeLists.txt | 1 + lib/mmseqs/azure-pipelines.yml | 2 +- lib/mmseqs/data/workflow/createtaxdb.sh | 17 +- lib/mmseqs/data/workflow/databases.sh | 23 +- lib/mmseqs/data/workflow/taxpercontig.sh | 6 +- lib/mmseqs/data/workflow/tsv2exprofiledb.sh | 32 +- lib/mmseqs/src/CMakeLists.txt | 7 +- lib/mmseqs/src/CommandDeclarations.h | 1 + lib/mmseqs/src/MMseqsBase.cpp | 10 +- lib/mmseqs/src/alignment/Alignment.cpp | 2 +- lib/mmseqs/src/commons/MathUtil.h | 7 + lib/mmseqs/src/commons/MultiParam.h | 16 +- lib/mmseqs/src/commons/Parameters.cpp | 4 +- .../prefiltering/CacheFriendlyOperations.cpp | 2 +- lib/mmseqs/src/util/CMakeLists.txt | 1 + lib/mmseqs/src/util/convertalignments.cpp | 65 +- lib/mmseqs/src/util/createclusterdb.cpp | 2 +- lib/mmseqs/src/util/recoverlongestorf.cpp | 148 ++++ lib/mmseqs/src/util/summarizeresult.cpp | 1 + lib/mmseqs/src/util/tsv2exprofiledb.cpp | 6 + lib/mmseqs/src/workflow/Taxonomy.cpp | 1 + lib/mmseqs/util/regression | 2 +- lib/tmalign/basic_fun.h | 1 - regression | 2 +- src/FoldseekBase.cpp | 90 +- src/LocalCommandDeclarations.h | 3 + src/commons/LocalParameters.cpp | 52 +- src/commons/LocalParameters.h | 30 + src/commons/TMaligner.cpp | 16 +- src/commons/TMaligner.h | 2 + src/strucclustutils/CMakeLists.txt | 1 + src/strucclustutils/GemmiWrapper.cpp | 236 ++++-- src/strucclustutils/GemmiWrapper.h | 2 +- src/strucclustutils/MultimerUtil.h | 42 +- src/strucclustutils/aln2tmscore.cpp | 5 +- src/strucclustutils/createmultimerreport.cpp | 7 +- src/strucclustutils/filtermultimer.cpp | 787 ++++++++++++++++++ src/strucclustutils/scoremultimer.cpp | 232 +++--- src/strucclustutils/structcreatedb.cpp | 445 ++++++++-- src/strucclustutils/structurealign.cpp | 3 +- src/strucclustutils/structureconvertalis.cpp | 19 +- .../structurerescorediagonal.cpp | 2 +- src/workflow/CMakeLists.txt | 2 + src/workflow/EasyMultimerCluster.cpp | 77 ++ src/workflow/EasyStructureSearch.cpp | 10 + src/workflow/MultimerCluster.cpp | 63 ++ src/workflow/StructureCluster.cpp | 6 +- src/workflow/StructureSearch.cpp | 2 + 58 files changed, 2586 insertions(+), 386 deletions(-) create mode 100644 data/easymultimercluster.sh create mode 100644 data/multimercluster.sh create mode 100644 lib/mmseqs/src/util/recoverlongestorf.cpp create mode 100644 src/strucclustutils/filtermultimer.cpp create mode 100644 src/workflow/EasyMultimerCluster.cpp create mode 100644 src/workflow/MultimerCluster.cpp diff --git a/.github/workflows/mac-arm64.yml b/.github/workflows/mac-arm64.yml index 4c02e26..38e9ee3 100644 --- a/.github/workflows/mac-arm64.yml +++ b/.github/workflows/mac-arm64.yml @@ -8,16 +8,20 @@ on: jobs: build: - runs-on: [self-hosted, macOS, ARM64] + runs-on: macos-latest steps: - uses: actions/checkout@v3 with: submodules: true + - name: Dependencies + run: | + brew install -f --overwrite cmake libomp rustup + rustup-init --profile minimal -q -y + - name: Build run: | mkdir -p build - rustup update cd build LIBOMP=$(brew --prefix libomp) cmake \ diff --git a/README.md b/README.md index 66e1de4..64c6aad 100644 --- a/README.md +++ b/README.md @@ -14,26 +14,45 @@ Foldseek enables fast and sensitive comparisons of large protein structure sets. # Table of Contents - [Foldseek](#foldseek) -- [Webserver](#webserver) -- [Installation](#installation) -- [Memory requirements](#memory-requirements) -- [Tutorial Video](#tutorial-video) -- [Documentation](#documentation) -- [Quick Start](#quick-start) - - [Search](#search) - - [Output](#output-search) - - [Important Parameters](#important-search-parameters) - - [Alignment Mode](#alignment-mode) - - [Structure search from FASTA input](#structure-search-from-fasta-input) - - [Databases](#databases) - - [Create Custom Databases and Indexes](#create-custom-databases-and-indexes) - - [Cluster](#cluster) - - [Output](#output-cluster) - - [Important Parameters](#important-cluster-parameters) - - [Multimer](#multimersearch) - - [Output](#multimer-search-output) -- [Main Modules](#main-modules) -- [Examples](#examples) + - [Publications](#publications) +- [Table of Contents](#table-of-contents) + - [Webserver](#webserver) + - [Installation](#installation) + - [Memory requirements](#memory-requirements) + - [Tutorial Video](#tutorial-video) + - [Documentation](#documentation) + - [Quick start](#quick-start) + - [Search](#search) + - [Output Search](#output-search) + - [Tab-separated](#tab-separated) + - [Superpositioned Cα only PDB files](#superpositioned-cα-only-pdb-files) + - [Interactive HTML](#interactive-html) + - [Important search parameters](#important-search-parameters) + - [Alignment Mode](#alignment-mode) + - [Structure search from FASTA input](#structure-search-from-fasta-input) + - [Databases](#databases) + - [Create custom databases and indexes](#create-custom-databases-and-indexes) + - [Cluster](#cluster) + - [Output Cluster](#output-cluster) + - [Tab-separated cluster](#tab-separated-cluster) + - [Representative fasta](#representative-fasta) + - [All member fasta](#all-member-fasta) + - [Important cluster parameters](#important-cluster-parameters) + - [Multimersearch](#multimersearch) + - [Using Multimersearch](#using-multimersearch) + - [Multimer Search Output](#multimer-search-output) + - [Tab-separated-complex](#tab-separated-complex) + - [Complex Report](#complex-report) + - [Multimercluster](#multimercluster) + - [Output MultimerCluster](#output-multimercluster) + - [Tab-separated multimercluster](#tab-separated-multimercluster) + - [Representative multimer fasta](#representative-multimer-fasta) + - [Filtered search result](#filtered-search-result) + - [Important multimer cluster parameters](#important-multimer-cluster-parameters) + - [Main Modules](#main-modules) + - [Examples](#examples) + - [Rescore aligments using TMscore](#rescore-aligments-using-tmscore) + - [Query centered multiple sequence alignment](#query-centered-multiple-sequence-alignment) ## Webserver Search your protein structures against the [AlphaFoldDB](https://alphafold.ebi.ac.uk/) and [PDB](https://www.rcsb.org/) in seconds using the Foldseek webserver ([code](https://github.com/soedinglab/mmseqs2-app)): [search.foldseek.com](https://search.foldseek.com) 🚀 @@ -238,6 +257,7 @@ MCAR...Q | --cov-mode | Alignment | 0: coverage of query and target, 1: coverage of target, 2: coverage of query | | --min-seq-id | Alignment | the minimum sequence identity to be clustered | | --tmscore-threshold | Alignment | accept alignments with an alignment TMscore > thr | +| --tmscore-threshold-mode | Alignment | normalize TMscore by 0: alignment, 1: representative, 2: member length | | --lddt-threshold | Alignment | accept alignments with an alignment LDDT score > thr | @@ -300,9 +320,64 @@ The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qsta 1tim.pdb.gz 8tim.pdb.gz A,B A,B 0.98941 0.98941 0.999983,0.000332,0.005813,-0.000373,0.999976,0.006884,-0.005811,-0.006886,0.999959 0.298992,0.060047,0.565875 0 ``` +### Multimercluster +The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates three output files with the following prefixes: (1) `_cluster.tsv`, (2) `_rep_seq.fasta` and (3) `_cluster_report`. The first file (1) is a [tab-separated](#tab-separated-multimercluster) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multimer-fasta). The third file (3) is also a [tab-separated](#filtered-search-result) file describing filtered alignments. + +Make sure chain names in PDB/mmcIF files does not contain underscores(_). + + foldseek easy-multimercluster example/ clu tmp --multimer-tm-threshold 0.65 --chain-tm-threshold 0.5 --interface-lddt-threshold 0.65 + +#### Output MultimerCluster +##### Tab-separated multimercluster +``` +5o002 5o002 +194l2 194l2 +194l2 193l2 +10mh121 10mh121 +10mh121 10mh114 +10mh121 10mh119 +``` +##### Representative multimer fasta +``` +#5o002 +>5o002_A +SHGK...R +>5o002_B +SHGK...R +#194l2 +>194l2_A0 +KVFG...L +>194l2_A6 +KVFG...L +#10mh121 +... +``` +##### Filtered search result +The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm, interface lddt, ustring, tstring` of alignments after filtering and before clustering. +``` +5o0f2 5o0f2 1.000 1.000 1.000 1.000 1.000 1.000,0.000,0.000,0.000,1.000,0.000,0.000,0.000,1.000 0.000,0.000,0.000 +5o0f2 5o0d2 1.000 1.000 0.999 0.992 1.000 0.999,0.000,-0.000,-0.000,0.999,-0.000,0.000,0.000,0.999 -0.004,-0.001,0.084 +5o0f2 5o082 1.000 0.990 0.978 0.962 0.921 0.999,-0.025,-0.002,0.025,0.999,-0.001,0.002,0.001,0.999 -0.039,0.000,-0.253 +``` +The query and target coverages here represent the sum of the coverages of all aligned chains, divided by the total query and target multimer length respectively. + +#### Important multimer cluster parameters + +| Option | Category | Description | +|-------------------|-----------------|-----------------------------------------------------------------------------------------------------------| +| -e | Sensitivity | List matches below this E-value (range 0.0-inf, default: 0.001); increasing it reports more distant structures | +| --alignment-type| Alignment | 0: 3Di Gotoh-Smith-Waterman (local, not recommended), 1: TMalign (global, slow), 2: 3Di+AA Gotoh-Smith-Waterman (local, default) | +| -c | Alignment | List matches above this fraction of aligned (covered) residues (see --cov-mode) (default: 0.0); higher coverage = more global alignment | +| --cov-mode | Alignment | 0: coverage of query and target (cluster multimers only with same chain numbers), 1: coverage of target, 2: coverage of query | +| --multimer-tm-threshold | Alignment | accept alignments with multimer alignment TMscore > thr | +| --chain-tm-threshold | Alignment | accept alignments if every single chain TMscore > thr | +| --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | + ## Main Modules - `easy-search` fast protein structure search - `easy-cluster` fast protein structure clustering +- `easy-multimersearch` fast protein multimer-level structure search +- `easy-multimercluster` fast protein multimer-level structure clustering - `createdb` create a database from protein structures (PDB,mmCIF, mmJSON) - `databases` download pre-assembled databases @@ -324,16 +399,6 @@ foldseek createtsv queryDB targetDB aln_tmscore aln_tmscore.tsv Output format `aln_tmscore.tsv`: query and target identifiers, TMscore, translation(3) and rotation vector=(3x3) -### Cluster search results -The following command performs an all-against-all alignments of the input structures and retains only the alignments, which cover 80% of the sequence (-c 0.8) (read more about alignment coverage options [here](https://github.com/soedinglab/MMseqs2/wiki#how-to-set-the-right-alignment-coverage-to-cluster)). It then clusters the results using a greedy set cover algorithm. The clustering mode can be adjusted using --cluster-mode, read more [here](https://github.com/soedinglab/MMseqs2/wiki#clustering-modes). The clustering output format is described [here](https://github.com/soedinglab/MMseqs2/wiki#cluster-tsv-format). - -``` -foldseek createdb example/ db -foldseek search db db aln tmpFolder -c 0.8 -foldseek clust db aln clu -foldseek createtsv db db clu clu.tsv -``` - ### Query centered multiple sequence alignment Foldseek can output multiple sequence alignments in a3m format using the following commands. To convert a3m to FASTA format, the following script can be used [reformat.pl](https://raw.githubusercontent.com/soedinglab/hh-suite/master/scripts/reformat.pl) (`reformat.pl in.a3m out.fas`). diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 15b105f..1a6d584 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -121,10 +121,10 @@ jobs: targetPath: $(Build.SourcesDirectory)/build/src/foldseek artifactName: foldseek-linux-$(SIMD) - - job: build_macos_11 - displayName: macOS 11 + - job: build_macos + displayName: macOS pool: - vmImage: 'macos-11' + vmImage: 'macos-12' steps: - checkout: self submodules: true @@ -153,7 +153,7 @@ jobs: pool: vmImage: 'ubuntu-latest' dependsOn: - - build_macos_11 + - build_macos - build_ubuntu_2004 - build_ubuntu_cross_2004 steps: diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index e51edd1..1c7b062 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -15,6 +15,8 @@ set(COMPILED_RESOURCES vendor.js.zst multimersearch.sh easymultimersearch.sh + multimercluster.sh + easymultimercluster.sh ) set(GENERATED_OUTPUT_HEADERS "") diff --git a/data/easymultimercluster.sh b/data/easymultimercluster.sh new file mode 100644 index 0000000..eb5b13d --- /dev/null +++ b/data/easymultimercluster.sh @@ -0,0 +1,163 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +mapCmplName2ChainKeys() { + awk -F"\t" 'FNR==1 {++fIndex} + fIndex==1 { + repName[$1]=1 + if (match($1, /MODEL/)){ + tmpName[$1]=1 + }else{ + tmpName[$1"_MODEL_1"]=1 + } + next + } + fIndex==2{ + if (match($2, /MODEL/)){ + if ($2 in tmpName){ + repId[$1]=1 + }else{ + ho[1]=1 + } + }else{ + if ($2 in repName){ + repId[$1]=1 + } + } + next + } + { + if ($3 in repId){ + print $1 + } + } + ' "${1}" "${2}.source" "${2}.lookup" > "${3}" +} + +postprocessFasta() { + awk ' BEGIN {FS=">"} + $0 ~/^>/ { + # match($2, /(.*).pdb*/) + split($2,parts,"_") + complex="" + for (j = 1; j < length(parts); j++) { + complex = complex parts[j] + if (j < length(parts)-1){ + complex=complex"_" + } + } + if (!(complex in repComplex)) { + print "#"complex + repComplex[complex] = "" + } + } + {print $0} + ' "${1}" > "${1}.tmp" && mv "${1}.tmp" "${1}" +} + +if notExists "${TMP_PATH}/query.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/query" ${CREATEDB_PAR} \ + || fail "query createdb died" +fi + +if notExists "${TMP_PATH}/multimer_clu.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" multimercluster "${TMP_PATH}/query" "${TMP_PATH}/multimer_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \ + || fail "Multimercluster died" +fi + +SOURCE="${TMP_PATH}/query" +INPUT="${TMP_PATH}/latest/multimer_db" +if notExists "${TMP_PATH}/cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + || fail "Convert Alignments died" + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clu_filt_info" "${TMP_PATH}/cluster_report" ${THREADS_PAR} \ + || fail "Convert Alignments died" +fi + +if notExists "${TMP_PATH}/multimer_rep_seqs.dbtype"; then + mapCmplName2ChainKeys "${TMP_PATH}/cluster.tsv" "${SOURCE}" "${TMP_PATH}/rep_seqs.list" + # shellcheck disable=SC2086 + "$MMSEQS" createsubdb "${TMP_PATH}/rep_seqs.list" "${SOURCE}" "${TMP_PATH}/multimer_rep_seqs" ${CREATESUBDB_PAR} \ + || fail "createsubdb died" +fi + +if notExists "${TMP_PATH}/multimer_rep_seq.fasta"; then + # shellcheck disable=SC2086 + "$MMSEQS" result2flat "${SOURCE}" "${SOURCE}" "${TMP_PATH}/multimer_rep_seqs" "${TMP_PATH}/multimer_rep_seq.fasta" ${VERBOSITY_PAR} \ + || fail "result2flat died" + postprocessFasta "${TMP_PATH}/multimer_rep_seq.fasta" +fi + +#TODO: generate fasta file for all sequences +# if notExists "${TMP_PATH}/multimer_all_seqs.fasta"; then +# # shellcheck disable=SC2086 +# "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/multimer_clust_seqs" ${THREADS_PAR} \ +# || fail "Result2repseq died" + +# # shellcheck disable=SC2086 +# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clust_seqs" "${TMP_PATH}/multimer_all_seqs.fasta" ${VERBOSITY_PAR} \ +# || fail "result2flat died" +# fi + +# mv "${TMP_PATH}/multimer_all_seqs.fasta" "${RESULT}_all_seqs.fasta" +mv "${TMP_PATH}/multimer_rep_seq.fasta" "${RESULT}_rep_seq.fasta" +mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" +mv "${TMP_PATH}/cluster_report" "${RESULT}_cluster_report" + +if [ -n "${REMOVE_TMP}" ]; then + rm "${INPUT}.0" + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/multimer_db" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + # "$MMSEQS" rmdb "${TMP_PATH}/multimer_clu_seqs" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/multimer_rep_seqs" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/multimer_rep_seqs_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/query" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/query_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${INPUT}" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${INPUT}_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/query_ca" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/query_ss" ${VERBOSITY_PAR} + rm "${TMP_PATH}/rep_seqs.list" + rm -rf "${TMP_PATH}/latest" + rm -f "${TMP_PATH}/easymultimercluster.sh" +fi \ No newline at end of file diff --git a/data/easystructuresearch.sh b/data/easystructuresearch.sh index ba482f0..30a06e7 100644 --- a/data/easystructuresearch.sh +++ b/data/easystructuresearch.sh @@ -51,6 +51,12 @@ if notExists "${TMP_PATH}/alis.dbtype"; then || fail "Convert Alignments died" fi +if [ -n "${TAXONOMY}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" taxonomyreport "${TARGET}${INDEXEXT}" "${INTERMEDIATE}" "${RESULTS}_report" ${TAXONOMYREPORT_PAR} \ + || fail "taxonomyreport died" +fi + if [ -n "${REMOVE_TMP}" ]; then if [ -n "${GREEDY_BEST_HITS}" ]; then # shellcheck disable=SC2086 diff --git a/data/multimercluster.sh b/data/multimercluster.sh new file mode 100644 index 0000000..8379968 --- /dev/null +++ b/data/multimercluster.sh @@ -0,0 +1,133 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +# Shift initial DB to complexDB using soft-linking +# $1: input db +# $2: output db +buildCmplDb() { + touch "${2}" + awk -F"\t" 'BEGIN {OFFSET=0} + FNR==NR{chain_len[$1]=$3;next} + { + if (!($3 in off_arr)) { + off_arr[$3]=OFFSET + } + cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] + } + END { + for (cmpl in off_arr) { + print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] + } + }' "${1}.index" "${1}.lookup" > "${2}.index" + ln -s "$(abspath "${1}")" "${2}.0" + cp "${1}.dbtype" "${2}.dbtype" +} + +buldCmplhDb(){ + awk -F"\t" 'BEGIN {INDEXVAL=0} + { + split($2,words," ") + split(words[1],parts,"_") + output_string=parts[1] + for (j = 2; j < length(parts); j++) { + if (j < length(parts)){ + output_string=output_string"_" + } + output_string = output_string parts[j] + } + headerstring="" + for (k = 2; k < length(words)+1; k++) { + headerstring = headerstring words[k]" " + } + if (!(output_string in gogo)){ + print INDEXVAL"\t"output_string" "headerstring + INDEXVAL++ + } + gogo[output_string]=1 + + }' "${1}" > "${2}" +} + + +# [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; + +if notExists "${TMP_PATH}/multimer_result.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" multimersearch "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_result" "${TMP_PATH}/multimersearch_tmp" ${MULTIMERSEARCH_PAR} \ + || fail "multimerSearch died" +fi + +if notExists "multimer_filt.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_result" "${TMP_PATH}/multimer_filt" ${FILTERMULTIMER_PAR} \ + || fail "FilterMultimer died" +fi + +# shift query DB, .index, .dbtype +if notExists "${TMP_PATH}/multimer_db.dbtype"; then + # build complex db as output + buildCmplDb "${INPUT}" "${TMP_PATH}/multimer_db" +fi + +# Shift _h, _h.dbtype +if notExists "${TMP_PATH}/multimer_db_h.dbtype"; then + # # shellcheck disable=SC2086 + # "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} \ + # || fail "tsv2db died" + # shellcheck disable=SC2086 + # "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" ${VERBOSITY_PAR} \ + "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" --threads 1 \ + || fail "createtsv died" + buldCmplhDb "${TMP_PATH}/chain_db_h.tsv" "${TMP_PATH}/multimer_header.tsv" + # shellcheck disable=SC2086 + "$MMSEQS" tsv2db "${TMP_PATH}/multimer_header.tsv" "${TMP_PATH}/multimer_db_h" ${VERBOSITY_PAR} \ + || fail "tsv2db died" +fi + +COMP="${TMP_PATH}/multimer_db" + +if notExists "${RESULT}.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" clust "${COMP}" "${TMP_PATH}/multimer_filt" "${RESULT}" ${CLUSTER_PAR} \ + || fail "Clustering died" + # shellcheck disable=SC2086 + "$MMSEQS" mvdb "${TMP_PATH}/multimer_filt_info" "${RESULT}_filt_info" \ + || fail "mv died" + +fi + +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/multimer_filt" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/multimer_result" ${VERBOSITY_PAR} + rm "${TMP_PATH}/chain_db_h.tsv" + rm "${TMP_PATH}/multimer_header.tsv" + rm -rf "${TMP_PATH}/multimersearch_tmp" + rm -f "${TMP_PATH}/multimercluster.sh" +fi \ No newline at end of file diff --git a/data/structdatabases.sh b/data/structdatabases.sh index 9f0b948..d10aac3 100644 --- a/data/structdatabases.sh +++ b/data/structdatabases.sh @@ -159,6 +159,15 @@ case "${SELECTION}" in push_back "${TMP_PATH}/cath50" INPUT_TYPE="FOLDSEEK_DB" ;; + "BFMD") + if notExists "${TMP_PATH}/bfmd.tar.gz"; then + downloadFile "https://foldseek.steineggerlab.workers.dev/bfmd.tar.gz" "${TMP_PATH}/bfmd.tar.gz" + downloadFile "https://foldseek.steineggerlab.workers.dev/bfmd.version" "${TMP_PATH}/version" + fi + tar xvfz "${TMP_PATH}/bfmd.tar.gz" -C "${TMP_PATH}" + push_back "${TMP_PATH}/bfmd" + INPUT_TYPE="FOLDSEEK_DB" + ;; "ProstT5") MODEL=prostt5-f16-safetensors.tar.gz if [ -n "${PROSTT5_QUANTIZED}" ]; then @@ -173,6 +182,14 @@ case "${SELECTION}" in tar xvfz "${TMP_PATH}/${MODEL}" -C "${OUTDB}" INPUT_TYPE="MODEL_WEIGHTS" ;; + "BFVD") + if notExists "${TMP_PATH}/bfvd.tar.gz"; then + downloadFile "https://bfvd.steineggerlab.workers.dev/bfvd_foldseekdb.tar.gz" "${TMP_PATH}/bfvd.tar.gz" + downloadFile "https://bfvd.steineggerlab.workers.dev/bfvd.version" "${TMP_PATH}/version" + fi + tar xvfz "${TMP_PATH}/bfvd.tar.gz" -C "${TMP_PATH}" + push_back "${TMP_PATH}/bfvd" + INPUT_TYPE="FOLDSEEK_DB" esac if notExists "${OUTDB}.dbtype"; then diff --git a/data/structureiterativesearch.sh b/data/structureiterativesearch.sh index 9a59a61..8cfe217 100644 --- a/data/structureiterativesearch.sh +++ b/data/structureiterativesearch.sh @@ -19,13 +19,19 @@ while [ "$STEP" -lt "$NUM_IT" ]; do if notExists "$TMP_PATH/pref_tmp_${STEP}.done"; then PARAM="PREFILTER_PAR_$STEP" eval TMP="\$$PARAM" + TOOL="prefilter" + if [ "$PREFMODE" = "UNGAPPED" ]; then + TOOL="ungappedprefilter" + PARAM="UNGAPPEDPREFILTER_PAR_$STEP" + eval TMP="\$$PARAM" + fi if [ $STEP -eq 0 ]; then # shellcheck disable=SC2086 - $RUNNER "$MMSEQS" prefilter "${QUERYDB}_ss" "${TARGET_PREFILTER}${INDEXEXT}" "$TMP_PATH/pref_${STEP}" ${TMP} \ + $RUNNER "$MMSEQS" $TOOL "${QUERYDB}_ss" "${TARGET_PREFILTER}${INDEXEXT}" "$TMP_PATH/pref_${STEP}" ${TMP} \ || fail "Prefilter died" else # shellcheck disable=SC2086 - $RUNNER "$MMSEQS" prefilter "${QUERYDB}_ss" "${TARGET_PREFILTER}${INDEXEXT}" "$TMP_PATH/pref_tmp_${STEP}" ${TMP} \ + $RUNNER "$MMSEQS" $TOOL "${QUERYDB}_ss" "${TARGET_PREFILTER}${INDEXEXT}" "$TMP_PATH/pref_tmp_${STEP}" ${TMP} \ || fail "Prefilter died" fi touch "$TMP_PATH/pref_tmp_${STEP}.done" diff --git a/lib/gemmi/mmcif.hpp b/lib/gemmi/mmcif.hpp index fbe6037..31a56be 100644 --- a/lib/gemmi/mmcif.hpp +++ b/lib/gemmi/mmcif.hpp @@ -579,7 +579,7 @@ inline Structure make_structure_from_block(const cif::Block& block_) { "occupancy", "B_iso_or_equiv", "?pdbx_formal_charge", - "auth_seq_id", + "?auth_seq_id", "?auth_comp_id", "?auth_asym_id", "?auth_atom_id", @@ -592,10 +592,13 @@ inline Structure make_structure_from_block(const cif::Block& block_) { // we use only one comp (residue) and one atom name const int kCompId = atom_table.first_of(kAuthCompId, kLabelCompId); const int kAtomId = atom_table.first_of(kAuthAtomId, kLabelAtomId); + const int kSeqId = atom_table.first_of(kAuthSeqId, kLabelSeqId); if (!atom_table.has_column(kCompId)) fail("Neither _atom_site.label_comp_id nor auth_comp_id found"); if (!atom_table.has_column(kAtomId)) fail("Neither _atom_site.label_atom_id nor auth_atom_id found"); + if (!atom_table.has_column(kSeqId)) + fail("Neither _atom_site.label_seq_id nor auth_seq_id found"); Model *model = nullptr; Chain *chain = nullptr; @@ -615,7 +618,7 @@ inline Structure make_structure_from_block(const cif::Block& block_) { resi = nullptr; } ResidueId rid = make_resid(cif::as_string(row[kCompId]), - cif::as_string(row[kAuthSeqId]), + cif::as_string(row[kSeqId]), row.has(kInsCode) ? &row[kInsCode] : nullptr); if (!resi || !resi->matches(rid)) { resi = chain->find_or_add_residue(rid); diff --git a/lib/mmseqs/CMakeLists.txt b/lib/mmseqs/CMakeLists.txt index 3b67bb8..adde426 100644 --- a/lib/mmseqs/CMakeLists.txt +++ b/lib/mmseqs/CMakeLists.txt @@ -117,6 +117,7 @@ if (NATIVE_ARCH AND (MMSEQS_ARCH STREQUAL "")) set(MMSEQS_ARCH "-march=native") endif () endif () +set(MMSEQS_ARCH ${MMSEQS_ARCH} CACHE INTERNAL "") if (NOT (MMSEQS_ARCH STREQUAL "")) set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} ${MMSEQS_ARCH}") diff --git a/lib/mmseqs/azure-pipelines.yml b/lib/mmseqs/azure-pipelines.yml index 93df1e9..798b538 100644 --- a/lib/mmseqs/azure-pipelines.yml +++ b/lib/mmseqs/azure-pipelines.yml @@ -40,7 +40,7 @@ jobs: - job: build_macos displayName: macOS pool: - vmImage: 'macos-11' + vmImage: 'macos-12' steps: - checkout: self submodules: true diff --git a/lib/mmseqs/data/workflow/createtaxdb.sh b/lib/mmseqs/data/workflow/createtaxdb.sh index c1adce1..7d81237 100755 --- a/lib/mmseqs/data/workflow/createtaxdb.sh +++ b/lib/mmseqs/data/workflow/createtaxdb.sh @@ -40,13 +40,22 @@ downloadFile() { ARIA) FILENAME=$(basename "${OUTPUT}") DIR=$(dirname "${OUTPUT}") - aria2c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "$FILENAME" -d "$DIR" "$URL" && return 0 + if aria2c -c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "${FILENAME}.aria2" -d "$DIR" "$URL"; then + mv -f -- "${OUTPUT}.aria2" "${OUTPUT}" + return 0 + fi ;; CURL) - curl -o "$OUTPUT" "$URL" && return 0 + if curl -L -C - -o "${OUTPUT}.curl" "$URL"; then + mv -f -- "${OUTPUT}.curl" "${OUTPUT}" + return 0 + fi ;; WGET) - wget -O "$OUTPUT" "$URL" && return 0 + if wget -O "${OUTPUT}.wget" -c "$URL"; then + mv -f -- "${OUTPUT}.wget" "${OUTPUT}" + return 0 + fi ;; esac done @@ -59,7 +68,7 @@ if { [ "${DBMODE}" = "1" ] && notExists "${TAXDBNAME}_taxonomy"; } || { [ "${DBM # Download NCBI taxon information if notExists "${TMP_PATH}/ncbi_download.complete"; then echo "Download taxdump.tar.gz" - downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" "${TMP_PATH}/taxdump.tar.gz" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz" "${TMP_PATH}/taxdump.tar.gz" tar -C "${TMP_PATH}" -xzf "${TMP_PATH}/taxdump.tar.gz" names.dmp nodes.dmp merged.dmp delnodes.dmp touch "${TMP_PATH}/ncbi_download.complete" rm -f "${TMP_PATH}/taxdump.tar.gz" diff --git a/lib/mmseqs/data/workflow/databases.sh b/lib/mmseqs/data/workflow/databases.sh index ce4f3cc..0e4df56 100644 --- a/lib/mmseqs/data/workflow/databases.sh +++ b/lib/mmseqs/data/workflow/databases.sh @@ -40,13 +40,22 @@ downloadFile() { ARIA) FILENAME=$(basename "${OUTPUT}") DIR=$(dirname "${OUTPUT}") - aria2c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "$FILENAME" -d "$DIR" "$URL" && return 0 + if aria2c -c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "${FILENAME}.aria2" -d "$DIR" "$URL"; then + mv -f -- "${OUTPUT}.aria2" "${OUTPUT}" + return 0 + fi ;; CURL) - curl -L -o "$OUTPUT" "$URL" && return 0 + if curl -L -C - -o "${OUTPUT}.curl" "$URL"; then + mv -f -- "${OUTPUT}.curl" "${OUTPUT}" + return 0 + fi ;; WGET) - wget -O "$OUTPUT" "$URL" && return 0 + if wget -O "${OUTPUT}.wget" -c "$URL"; then + mv -f -- "${OUTPUT}.wget" "${OUTPUT}" + return 0 + fi ;; esac done @@ -118,9 +127,9 @@ case "${SELECTION}" in if notExists "${TMP_PATH}/nr.gz"; then date "+%s" > "${TMP_PATH}/version" downloadFile "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz" "${TMP_PATH}/nr.gz" - downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" "${TMP_PATH}/prot.accession2taxid.gz" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" "${TMP_PATH}/prot.accession2taxid.gz" gunzip "${TMP_PATH}/prot.accession2taxid.gz" - downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz" "${TMP_PATH}/pdb.accession2taxid.gz" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz" "${TMP_PATH}/pdb.accession2taxid.gz" gunzip "${TMP_PATH}/pdb.accession2taxid.gz" fi push_back "${TMP_PATH}/nr.gz" @@ -212,8 +221,8 @@ case "${SELECTION}" in ;; "CDD") if notExists "${TMP_PATH}/msa.msa.gz"; then - downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cdd.info" "${TMP_PATH}/version" - downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/fasta.tar.gz" "${TMP_PATH}/msa.tar.gz" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/cdd.info" "${TMP_PATH}/version" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/fasta.tar.gz" "${TMP_PATH}/msa.tar.gz" fi INPUT_TYPE="FASTA_MSA" SED_FIX_LOOKUP='s|\.FASTA||g' diff --git a/lib/mmseqs/data/workflow/taxpercontig.sh b/lib/mmseqs/data/workflow/taxpercontig.sh index 299ac36..f350df2 100644 --- a/lib/mmseqs/data/workflow/taxpercontig.sh +++ b/lib/mmseqs/data/workflow/taxpercontig.sh @@ -45,7 +45,11 @@ if [ -n "${ORF_FILTER}" ]; then fi if notExists "${TMP_PATH}/orfs_aln.list"; then - awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln.list" + # shellcheck disable=SC2086 + "$MMSEQS" recoverlongestorf "${ORFS_DB}" "${TMP_PATH}/orfs_aln" "${TMP_PATH}/orfs_aln_recovered.list" ${THREADS_PAR} \ + || fail "recoverlongestorf died" + awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln_remain.list" + cat "${TMP_PATH}/orfs_aln_recovered.list" "${TMP_PATH}/orfs_aln_remain.list" > "${TMP_PATH}/orfs_aln.list" fi if notExists "${TMP_PATH}/orfs_filter.dbtype"; then diff --git a/lib/mmseqs/data/workflow/tsv2exprofiledb.sh b/lib/mmseqs/data/workflow/tsv2exprofiledb.sh index 3323410..184315d 100644 --- a/lib/mmseqs/data/workflow/tsv2exprofiledb.sh +++ b/lib/mmseqs/data/workflow/tsv2exprofiledb.sh @@ -17,25 +17,37 @@ OUT="$2" [ -d "${OUT}.tsv" ] && echo "${OUT} is a directory!" && exit 1; if notExists "${OUT}_h.dbtype"; then - "$MMSEQS" tsv2db "${IN}_h.tsv" "${OUT}_h" --output-dbtype 12 ${VERBOSITY} + MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}_h.tsv" "${OUT}_h" --output-dbtype 12 ${VERBOSITY} fi if notExists "${OUT}.dbtype"; then - "$MMSEQS" tsv2db "${IN}.tsv" "${OUT}_tmp" --output-dbtype 0 ${VERBOSITY} - MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_tmp" "${OUT}" ${VERBOSITY} - "$MMSEQS" rmdb "${OUT}_tmp" ${VERBOSITY} + if [ -n "${COMPRESSED}" ]; then + "$MMSEQS" tsv2db "${IN}.tsv" "${OUT}_tmp" --output-dbtype 0 ${VERBOSITY} + MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_tmp" "${OUT}" ${VERBOSITY} + "$MMSEQS" rmdb "${OUT}_tmp" ${VERBOSITY} + else + MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}.tsv" "${OUT}" --output-dbtype 0 ${VERBOSITY} + fi fi if notExists "${OUT}_seq.dbtype"; then - "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq_tmp" --output-dbtype 0 ${VERBOSITY} - MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_seq_tmp" "${OUT}_seq" ${VERBOSITY} - "$MMSEQS" rmdb "${OUT}_seq_tmp" ${VERBOSITY} + if [ -n "${COMPRESSED}" ]; then + "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq_tmp" --output-dbtype 0 ${VERBOSITY} + MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_seq_tmp" "${OUT}_seq" ${VERBOSITY} + "$MMSEQS" rmdb "${OUT}_seq_tmp" ${VERBOSITY} + else + "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq" --output-dbtype 0 ${VERBOSITY} + fi fi if notExists "${OUT}_aln.dbtype"; then - "$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln_tmp" --output-dbtype 5 ${VERBOSITY} - MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_aln_tmp" "${OUT}_aln" ${VERBOSITY} - "$MMSEQS" rmdb "${OUT}_aln_tmp" ${VERBOSITY} + if [ -n "${COMPRESSED}" ]; then + "$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln_tmp" --output-dbtype 5 ${VERBOSITY} + MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_aln_tmp" "${OUT}_aln" ${VERBOSITY} + "$MMSEQS" rmdb "${OUT}_aln_tmp" ${VERBOSITY} + else + MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln" --output-dbtype 5 ${VERBOSITY} + fi fi if notExists "${OUT}_seq_h.dbtype"; then diff --git a/lib/mmseqs/src/CMakeLists.txt b/lib/mmseqs/src/CMakeLists.txt index 976ac78..a84726d 100644 --- a/lib/mmseqs/src/CMakeLists.txt +++ b/lib/mmseqs/src/CMakeLists.txt @@ -131,9 +131,9 @@ else () endif () target_link_libraries(mmseqs-framework tinyexpr ${ZSTD_LIBRARIES} microtar) -if (CYGWIN) - target_link_libraries(mmseqs-framework nedmalloc) -endif () +# if (CYGWIN) +# target_link_libraries(mmseqs-framework nedmalloc) +# endif () if (EMSCRIPTEN) target_compile_definitions(mmseqs-framework PUBLIC -DHAVE_ZLIB=1 -DHAVE_BZLIB=1) @@ -222,6 +222,7 @@ if (OPENMP_FOUND) if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") target_link_libraries(mmseqs-framework ${OpenMP_CXX_LIBRARIES}) endif() + target_include_directories(mmseqs-framework PUBLIC ${OpenMP_CXX_INCLUDE_DIRS}) append_target_property(mmseqs-framework COMPILE_FLAGS ${OpenMP_CXX_FLAGS}) append_target_property(mmseqs-framework LINK_FLAGS ${OpenMP_CXX_FLAGS}) elseif (REQUIRE_OPENMP) diff --git a/lib/mmseqs/src/CommandDeclarations.h b/lib/mmseqs/src/CommandDeclarations.h index e8eac29..3c4abb4 100644 --- a/lib/mmseqs/src/CommandDeclarations.h +++ b/lib/mmseqs/src/CommandDeclarations.h @@ -98,6 +98,7 @@ extern int ungappedprefilter(int argc, const char **argv, const Command& command extern int gappedprefilter(int argc, const char **argv, const Command& command); extern int unpackdb(int argc, const char **argv, const Command& command); extern int rbh(int argc, const char **argv, const Command& command); +extern int recoverlongestorf(int argc, const char **argv, const Command& command); extern int result2flat(int argc, const char **argv, const Command& command); extern int result2msa(int argc, const char **argv, const Command& command); extern int result2dnamsa(int argc, const char **argv, const Command& command); diff --git a/lib/mmseqs/src/MMseqsBase.cpp b/lib/mmseqs/src/MMseqsBase.cpp index 282e740..8325f0d 100644 --- a/lib/mmseqs/src/MMseqsBase.cpp +++ b/lib/mmseqs/src/MMseqsBase.cpp @@ -907,6 +907,14 @@ std::vector baseCommands = { "Eli Levy Karin ", " ", CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}, + {"recoverlongestorf", recoverlongestorf, &par.onlythreads, COMMAND_EXPERT, + "Recover longest ORF for taxonomy annotation after elimination", + NULL, + "Sung-eun Jang", + " ", + CITATION_MMSEQS2, {{"orfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}, + {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb}, + {"tsvFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"reverseseq", reverseseq, &par.reverseseq, COMMAND_SEQUENCE, "Reverse (without complement) sequences", NULL, @@ -1142,7 +1150,7 @@ std::vector baseCommands = { " ", CITATION_MMSEQS2,{{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}, - {"tsv2exprofiledb", tsv2exprofiledb, &par.onlyverbosity, COMMAND_PROFILE_PROFILE, + {"tsv2exprofiledb", tsv2exprofiledb, &par.verbandcompression, COMMAND_PROFILE_PROFILE, "Create a expandable profile db from TSV files", NULL, "Milot Mirdita ", diff --git a/lib/mmseqs/src/alignment/Alignment.cpp b/lib/mmseqs/src/alignment/Alignment.cpp index 384dcc2..59625c8 100644 --- a/lib/mmseqs/src/alignment/Alignment.cpp +++ b/lib/mmseqs/src/alignment/Alignment.cpp @@ -419,7 +419,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, con // recompute alignment boundaries (without changing evalue) const bool isIdentity = (queryDbKey == swResults[result].dbKey && (includeIdentity || sameQTDB)) ? true : false; - Matcher::result_t res = realigner->getSWResult(&dbSeq, INT_MAX, false, realignCov, covThr, FLT_MAX, realignSwMode, seqIdMode, isIdentity); + Matcher::result_t res = realigner->getSWResult(&dbSeq, INT_MAX, false, covMode, realignCov, FLT_MAX, realignSwMode, seqIdMode, isIdentity); const bool covOK = Util::hasCoverage(realignCov, covMode, res.qcov, res.dbcov); if (covOK == true || isIdentity) { diff --git a/lib/mmseqs/src/commons/MathUtil.h b/lib/mmseqs/src/commons/MathUtil.h index 6808cf8..06eeb3b 100644 --- a/lib/mmseqs/src/commons/MathUtil.h +++ b/lib/mmseqs/src/commons/MathUtil.h @@ -256,6 +256,13 @@ class MathUtil { return sum; } + static float squareDist(const float xx, const float xy, const float xz, + const float yx, const float yy, const float yz){ + float d1 = xx - yx; + float d2 = xy - yy; + float d3 = xz - yz; + return (d1 * d1 + d2 * d2 + d3 * d3); + } }; diff --git a/lib/mmseqs/src/commons/MultiParam.h b/lib/mmseqs/src/commons/MultiParam.h index 989ea2f..966b4f7 100644 --- a/lib/mmseqs/src/commons/MultiParam.h +++ b/lib/mmseqs/src/commons/MultiParam.h @@ -23,21 +23,21 @@ class NuclAA { T first; T second; - NuclAA(const NuclAA &value) { + NuclAA(const NuclAA &value) { this->first = value.first; this->second = value.second; } static const T max; - NuclAA() {} + NuclAA() {} - NuclAA(T first) { + NuclAA(T first) { this->first = first; this->second = first; } - NuclAA(T first, T second) { + NuclAA(T first, T second) { this->first = first; this->second = second; } @@ -88,21 +88,21 @@ class SeqProf { T first; T second; - SeqProf(const SeqProf &value) { + SeqProf(const SeqProf &value) { this->first = value.first; this->second = value.second; } static const T max; - SeqProf() {} + SeqProf() {} - SeqProf(T first) { + SeqProf(T first) { this->first = first; this->second = first; } - SeqProf(T first, T second) { + SeqProf(T first, T second) { this->first = first; this->second = second; } diff --git a/lib/mmseqs/src/commons/Parameters.cpp b/lib/mmseqs/src/commons/Parameters.cpp index c9d0a16..34e4298 100644 --- a/lib/mmseqs/src/commons/Parameters.cpp +++ b/lib/mmseqs/src/commons/Parameters.cpp @@ -283,7 +283,7 @@ Parameters::Parameters(): // taxonomyreport PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID, "--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"), // createtaxdb - PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void *) &ncbiTaxDump, ""), + PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void *) &ncbiTaxDump, ""), PARAM_TAX_MAPPING_FILE(PARAM_TAX_MAPPING_FILE_ID, "--tax-mapping-file", "Taxonomy mapping file", "File to map sequence identifier to taxonomical identifier", typeid(std::string), (void *) &taxMappingFile, ""), PARAM_TAX_MAPPING_MODE(PARAM_TAX_MAPPING_MODE_ID, "--tax-mapping-mode", "Taxonomy mapping mode", "Map taxonomy based on sequence database 0: .lookup file 1: .source file", typeid(int), (void *) &taxMappingMode, "^[0-1]{1}$"), PARAM_TAX_DB_MODE(PARAM_TAX_DB_MODE_ID, "--tax-db-mode", "Taxonomy db mode", "Create taxonomy database as: 0: .dmp flat files (human readable) 1: binary dump (faster readin)", typeid(int), (void *) &taxDbMode, "^[0-1]{1}$"), @@ -1870,7 +1870,7 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c EXIT(EXIT_FAILURE); } filenames.emplace_back(posix); - delete posix; + delete[] posix; } #else filenames.emplace_back(pargv[argIdx]); diff --git a/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp b/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp index 139c8ab..61abc40 100644 --- a/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp +++ b/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp @@ -27,7 +27,7 @@ CacheFriendlyOperations::CacheFriendlyOperations(size_t maxElement, siz } template -CacheFriendlyOperations::~CacheFriendlyOperations(){ +CacheFriendlyOperations::~CacheFriendlyOperations(){ delete[] duplicateBitArray; delete[] binDataFrame; delete[] tmpElementBuffer; diff --git a/lib/mmseqs/src/util/CMakeLists.txt b/lib/mmseqs/src/util/CMakeLists.txt index 7fa7b06..c43c335 100644 --- a/lib/mmseqs/src/util/CMakeLists.txt +++ b/lib/mmseqs/src/util/CMakeLists.txt @@ -47,6 +47,7 @@ set(util_source_files util/profile2pssm.cpp util/profile2neff.cpp util/profile2seq.cpp + util/recoverlongestorf.cpp util/result2dnamsa.cpp util/result2flat.cpp util/result2msa.cpp diff --git a/lib/mmseqs/src/util/convertalignments.cpp b/lib/mmseqs/src/util/convertalignments.cpp index ffa5425..79fb63b 100644 --- a/lib/mmseqs/src/util/convertalignments.cpp +++ b/lib/mmseqs/src/util/convertalignments.cpp @@ -349,6 +349,9 @@ int convertalignments(int argc, const char **argv, const Command &command) { std::string newBacktrace; newBacktrace.reserve(1024); + std::string complementBuffer; + complementBuffer.reserve(1024); + const TaxonNode * taxonNode = NULL; #pragma omp for schedule(dynamic, 10) @@ -640,10 +643,11 @@ int convertalignments(int argc, const char **argv, const Command &command) { float pPositive = 0; int matchCount = 0; if (res.backtrace.empty() == false) { - int qPos = 0; - int tPos = 0; - for (size_t pos = 0; pos < res.backtrace.size(); pos++) { - switch (res.backtrace[pos]) { + int qPos = res.qStartPos; + int tPos = res.dbStartPos; + std::string unpackedBt = Matcher::uncompressAlignment(res.backtrace); + for (size_t pos = 0; pos < unpackedBt.size(); pos++) { + switch (unpackedBt[pos]) { case 'M': { char qRes = queryProfile ? queryProfData[qPos] : querySeqData[qPos]; char tRes = targetProfile ? targetProfData[tPos] : targetSeqData[tPos]; @@ -654,18 +658,38 @@ int convertalignments(int argc, const char **argv, const Command &command) { break; } case 'D': - qPos++; + tPos++; break; case 'I': - tPos++; + qPos++; break; } } - pPositive /= matchCount; + pPositive /= static_cast(matchCount); } result.append(SSTR(pPositive)); break; } + case Parameters::OUTFMT_QFRAME: { + int frame; + if (res.qStartPos <= res.qEndPos) { + frame = (res.qStartPos - 1) % 3 + 1; + } else { + frame = -1 * ((res.qLen - res.qStartPos) % 3 + 1); + } + result.append(SSTR(frame)); + break; + } + case Parameters::OUTFMT_TFRAME: { + int frame; + if (res.dbStartPos <= res.dbEndPos) { + frame = (res.dbStartPos - 1) % 3 + 1; + } else { + frame = -1 * ((res.dbLen - res.dbStartPos) % 3 + 1); + } + result.append(SSTR(frame)); + break; + } } if (i < outcodes.size() - 1) { result.push_back('\t'); @@ -694,32 +718,45 @@ int convertalignments(int argc, const char **argv, const Command &command) { break; } case Parameters::FORMAT_ALIGNMENT_SAM: { - bool strand = res.qEndPos > res.qStartPos; + bool forward = res.qEndPos > res.qStartPos; int rawScore = static_cast(evaluer->computeRawScoreFromBitScore(res.score) + 0.5); uint32_t mapq = -4.343 * log(exp(static_cast(-rawScore))); mapq = (uint32_t) (mapq + 4.99); mapq = mapq < 254 ? mapq : 254; - int count = snprintf(buffer, sizeof(buffer), "%s\t%d\t%s\t%d\t%d\t", queryId.c_str(), (strand) ? 16: 0, targetId.c_str(), res.dbStartPos + 1, mapq); + int count = snprintf(buffer, sizeof(buffer), "%s\t%d\t%s\t%d\t%d\t", queryId.c_str(), (forward) ? 0 : 16, targetId.c_str(), res.dbStartPos + 1, mapq); if (count < 0 || static_cast(count) >= sizeof(buffer)) { Debug(Debug::WARNING) << "Truncated line in entry" << i << "!\n"; continue; } result.append(buffer, count); - if (isTranslatedSearch == true && targetNucs == true && queryNucs == true) { + if (isTranslatedSearch == true && queryNucs == true) { Matcher::result_t::protein2nucl(res.backtrace, newBacktrace); result.append(newBacktrace); newBacktrace.clear(); - } else { result.append(res.backtrace); } result.append("\t*\t0\t0\t"); int start = std::min(res.qStartPos, res.qEndPos); int end = std::max(res.qStartPos, res.qEndPos); - if (queryProfile) { - result.append(queryProfData.c_str() + start, (end + 1) - start); + if (forward == false && queryNucs == true) { + if (queryProfile) { + complementBuffer.assign(queryProfData.c_str() + start, (end + 1) - start); + } else { + complementBuffer.assign(querySeqData + start, (end + 1) - start); + } + std::reverse(complementBuffer.begin(), complementBuffer.end()); + for (size_t i = 0; i < complementBuffer.size(); i++) { + complementBuffer[i] = Orf::complement(complementBuffer[i]); + } + result.append(complementBuffer); + complementBuffer.clear(); } else { - result.append(querySeqData + start, (end + 1) - start); + if (queryProfile) { + result.append(queryProfData.c_str() + start, (end + 1) - start); + } else { + result.append(querySeqData + start, (end + 1) - start); + } } count = snprintf(buffer, sizeof(buffer), "\t*\tAS:i:%d\tNM:i:%d\n", rawScore, missMatchCount); if (count < 0 || static_cast(count) >= sizeof(buffer)) { diff --git a/lib/mmseqs/src/util/createclusterdb.cpp b/lib/mmseqs/src/util/createclusterdb.cpp index 3a8bdb6..72dd5bf 100644 --- a/lib/mmseqs/src/util/createclusterdb.cpp +++ b/lib/mmseqs/src/util/createclusterdb.cpp @@ -43,7 +43,7 @@ int createclusearchdb(int argc, const char **argv, const Command& command) { #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif - #pragma omp for schedule(dynamic, 1) + #pragma omp for schedule(static) for (size_t id = 0; id < clusterReader.getSize(); id++) { progress.updateProgress(); char *data = clusterReader.getData(id, thread_idx); diff --git a/lib/mmseqs/src/util/recoverlongestorf.cpp b/lib/mmseqs/src/util/recoverlongestorf.cpp new file mode 100644 index 0000000..b25d894 --- /dev/null +++ b/lib/mmseqs/src/util/recoverlongestorf.cpp @@ -0,0 +1,148 @@ +#include "Parameters.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "Debug.h" +#include "Util.h" +#include "Orf.h" + +#include +#include + +#ifdef OPENMP +#include +#endif + +int recoverlongestorf(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + + DBReader headerReader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + headerReader.open(DBReader::LINEAR_ACCCESS); + + DBReader resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + resultReader.open(DBReader::LINEAR_ACCCESS); + + DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, false, Parameters::DBTYPE_OMIT_FILE); + writer.open(); + + Debug::Progress progress(resultReader.getSize()); + std::unordered_map> contigToLongest; +#pragma omp parallel + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); + +#endif + std::unordered_map> localContigToLongest; + +#pragma omp for schedule(dynamic, 100) + for (size_t id = 0; id < headerReader.getSize(); ++id) { + progress.updateProgress(); + unsigned int orfKey = headerReader.getDbKey(id); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + size_t orfLen = std::max(orf.from, orf.to) - std::min(orf.from, orf.to) + 1; + std::unordered_map>::iterator it = localContigToLongest.find(contigKey); + if (it != localContigToLongest.end()) { + std::pair orfKeyToLength = it->second; + if (orfLen > orfKeyToLength.second) { + it->second = std::make_pair(orfKey, orfLen); + } + } else { + localContigToLongest.emplace(contigKey, std::make_pair(orfKey, orfLen)); + } + } + +#pragma omp critical + { + for (auto &entry : localContigToLongest) { + auto &contigKey = entry.first; + auto &localData = entry.second; + auto it = contigToLongest.find(contigKey); + if (it != contigToLongest.end()) { + if (localData.second > it->second.second) { + it->second = localData; + } + } else { + contigToLongest[contigKey] = localData; + } + } + } + } + + progress.reset(resultReader.getSize()); + std::unordered_set acceptedContigs; + std::unordered_set eliminatedContigs; +#pragma omp parallel + { + int thread_idx = 0; +#ifdef OPENMP + thread_idx = omp_get_thread_num(); +#endif + std::string resultBuffer; + resultBuffer.reserve(1024 * 1024); + + std::unordered_set localAcceptedContigs; + std::unordered_set localEliminatedContigs; +#pragma omp for schedule(dynamic, 5) + for (size_t i = 0; i < resultReader.getSize(); ++i) { + progress.updateProgress(); + + unsigned int key = resultReader.getDbKey(i); + size_t entryLength = resultReader.getEntryLen(i); + if (entryLength > 1) { + size_t id = headerReader.getId(key); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + localAcceptedContigs.emplace(contigKey); + } + + size_t id = headerReader.getId(key); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + localEliminatedContigs.emplace(contigKey); + } + +#pragma omp critical + { + acceptedContigs.insert(localAcceptedContigs.begin(), localAcceptedContigs.end()); + eliminatedContigs.insert(localEliminatedContigs.begin(), localEliminatedContigs.end()); + } + } + + for (auto it = eliminatedContigs.begin(); it != eliminatedContigs.end(); ) { + if (acceptedContigs.find(*it) != acceptedContigs.end()) { + it = eliminatedContigs.erase(it); + } else { + ++it; + } + } + + std::string resultBuffer; + resultBuffer.reserve(1024 * 1024); + for (auto contigIt = eliminatedContigs.begin(); contigIt != eliminatedContigs.end(); ++contigIt) { + unsigned int contigKey = *contigIt; + std::unordered_map>::iterator it = contigToLongest.find(contigKey); + if (it != contigToLongest.end()) { + unsigned int orfKey = it->second.first; + resultBuffer.append(SSTR(orfKey)); + resultBuffer.append(1, '\n'); + writer.writeData(resultBuffer.c_str(), resultBuffer.length(), orfKey, 0, false, false); + resultBuffer.clear(); + } else { + Debug(Debug::ERROR) << "Missing contig " << contigKey << "\n"; + EXIT(EXIT_FAILURE); + } + } + + writer.close(true); + FileUtil::remove(writer.getIndexFileName()); + headerReader.close(); + resultReader.close(); + + return EXIT_SUCCESS; +} diff --git a/lib/mmseqs/src/util/summarizeresult.cpp b/lib/mmseqs/src/util/summarizeresult.cpp index 412bf59..7b1c44d 100644 --- a/lib/mmseqs/src/util/summarizeresult.cpp +++ b/lib/mmseqs/src/util/summarizeresult.cpp @@ -94,6 +94,7 @@ int summarizeresult(int argc, const char **argv, const Command &command) { } } writer.close(merge); + reader.close(); #ifdef HAVE_MPI MPI_Barrier(MPI_COMM_WORLD); diff --git a/lib/mmseqs/src/util/tsv2exprofiledb.cpp b/lib/mmseqs/src/util/tsv2exprofiledb.cpp index 96a6029..f1c2173 100644 --- a/lib/mmseqs/src/util/tsv2exprofiledb.cpp +++ b/lib/mmseqs/src/util/tsv2exprofiledb.cpp @@ -6,14 +6,20 @@ #include "tsv2exprofiledb.sh.h" +void setTsv2ExProfileDbDefaults(Parameters *p) { + p->compressed = true; +} + int tsv2exprofiledb(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); + setTsv2ExProfileDbDefaults(&par); par.parseParameters(argc, argv, command, true, 0, 0); std::string program = par.db2 + ".sh"; FileUtil::writeFile(program, tsv2exprofiledb_sh, tsv2exprofiledb_sh_len); CommandCaller cmd; + cmd.addVariable("COMPRESSED", par.compressed ? "TRUE" : NULL); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); cmd.execProgram(FileUtil::getRealPathFromSymLink(program).c_str(), par.filenames); diff --git a/lib/mmseqs/src/workflow/Taxonomy.cpp b/lib/mmseqs/src/workflow/Taxonomy.cpp index 4dbd34c..78bb760 100644 --- a/lib/mmseqs/src/workflow/Taxonomy.cpp +++ b/lib/mmseqs/src/workflow/Taxonomy.cpp @@ -95,6 +95,7 @@ int taxonomy(int argc, const char **argv, const Command& command) { cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("RUNNER", par.runner.c_str()); cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); if (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED && !(searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) { diff --git a/lib/mmseqs/util/regression b/lib/mmseqs/util/regression index ac1fc17..5e3bc17 160000 --- a/lib/mmseqs/util/regression +++ b/lib/mmseqs/util/regression @@ -1 +1 @@ -Subproject commit ac1fc179cc06139452da92de754411df780573fa +Subproject commit 5e3bc17e17fb7d34e459bc8ad2bedbf7ded2a038 diff --git a/lib/tmalign/basic_fun.h b/lib/tmalign/basic_fun.h index f23b36c..bc2b53f 100644 --- a/lib/tmalign/basic_fun.h +++ b/lib/tmalign/basic_fun.h @@ -124,5 +124,4 @@ class BasicFunction{ // transform(t, u, x.x[i], x.y[i], x.z[i], y.x[i], y.y[i], y.z[i]); } } - }; \ No newline at end of file diff --git a/regression b/regression index 0c37f22..4c79b42 160000 --- a/regression +++ b/regression @@ -1 +1 @@ -Subproject commit 0c37f22458b39ee61dce65d737a88e3010850f4c +Subproject commit 4c79b428b1b25e648fffd55b20ab4f9a3f36bcd5 diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 491cf44..7621ad6 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -258,8 +258,8 @@ std::vector foldseekCommands = { CITATION_FOLDSEEK, {{"Db", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb }, {"pdbFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"scoremultimer", scoremultimer, &localPar.scoremultimer, COMMAND_ALIGNMENT, - "Get complex level alignments from alignmentDB", - "# Get complex level alignments (chain assignments and tm-scores) from alignmentDB.\n" + "Get multimer level alignments from alignmentDB", + "# Get multimer level alignments (chain assignments and tm-scores) from alignmentDB.\n" "foldseek scoremultimer queryDB targetDB alignmentDB complexDB\n" "# simple tsv output format" "foldseek createmultimerreport queryDB targetDB complexDB result.tsv" @@ -277,9 +277,61 @@ std::vector foldseekCommands = { {"scorecomplex", scoremultimer, &localPar.scoremultimer, COMMAND_HIDDEN, "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} }, + {"filtermultimer", filtermultimer, &localPar.filtermultimer, COMMAND_HIDDEN, + "Filters multimers satisfying given coverage", + "foldseek filtermultimer queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", + "Seongeun Kim & Sooyoung Cha ", + " ", + CITATION_FOLDSEEK_MULTIMER, { + {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, + {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb } + } + }, + {"multimercluster", multimercluster, &localPar.multimerclusterworkflow, COMMAND_MAIN, + "Multimer level cluster", + "#Clustering of PDB DB\n" + "foldseek multimercluster queryDB clusterDB tmp\n" + "# --cov-mode \n" + "# Sequence 0 1 2\n" + "# Q: MAVGTACRPA 60% IGN 60%\n" + "# T: -AVGTAC--- 60% 100% IGN\n" + "# Cutoff -c 0.7 - + -\n" + "# -c 0.6 + + +\n\n", + "Seongeun Kim & Sooyoung Cha ", + " ", + CITATION_FOLDSEEK_MULTIMER, { + {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}, + {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, + {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } + } + }, + {"easy-multimercluster", easymultimercluster, &localPar.easymultimerclusterworkflow, COMMAND_EASY, + "Multimer level cluster", + "#Clustering of PDB files\n" + "foldseek easy-multimercluster examples/ result tmp\n" + "# Cluster output\n" + "# - result_rep_seq.fasta: Representatives\n" + "# - result_cluster.tsv: Adjacency list\n\n" + "# Important parameter: --cov-mode and -c \n" + "# --cov-mode \n" + "# 0 1 2\n" + "# Q: MAVGTACRPA 60% IGN 60%\n" + "# T: -AVGTAC--- 60% 100% IGN\n" + "# -c 0.7 - + -\n" + "# -c 0.6 + + +\n\n", + "Seongeun Kim & Sooyoung Cha ", + " ... ", + CITATION_FOLDSEEK_MULTIMER, { + {"PDB|mmCIF[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &FoldSeekDbValidator::flatfileStdinAndFolder}, + {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}, + {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } + } + }, {"multimersearch", multimersearch, &localPar.multimersearchworkflow, COMMAND_MAIN, - "Complex level search", - "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" + "Multimer level search", + "# Search a single/multiple PDB file against a set of PDB files and get multimer level alignments\n" "foldseek multimersearch queryDB targetDB result tmp\n" "# Format output differently\n" "foldseek multimersearch queryDB targetDB result tmp --format-output query,target,qstart,tstart,cigar\n" @@ -300,8 +352,8 @@ std::vector foldseekCommands = { "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} }, {"easy-multimersearch", easymultimersearch, &localPar.easymultimersearchworkflow, COMMAND_EASY, - "Complex level search", - "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" + "Multimer level search", + "# Search a single/multiple PDB file against a set of PDB files and get multimer level alignments\n" "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp\n" "# Format output differently\n" "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp --format-output query,target,qstart,tstart,cigar\n" @@ -324,8 +376,8 @@ std::vector foldseekCommands = { {"createmultimerreport", createmultimerreport, &localPar.createmultimerreport, COMMAND_FORMAT_CONVERSION, "Convert complexDB to tsv format", "# Create output in tsv format (9 columns): qComplexName.c_str(), tComplexName.c_str(), qChainString.c_str(), tChainString.c_str(), qTMScore, tTMScore, u, t, assId\n" - "# (1,2) identifiers for query and target complex,\n" - "# (3,4) chains of query complex and target complex,\n" + "# (1,2) identifiers for query and target multimers,\n" + "# (3,4) chains of query multimer and target multimer,\n" "# (5,6) tm score based on query and target residue length,\n" "# (8,9) u and t,\n" "# (9) assignment id\n" @@ -343,7 +395,7 @@ std::vector foldseekCommands = { "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} }, {"expandmultimer", expandmultimer, &localPar.expandmultimer, COMMAND_PREFILTER, - "Re-prefilter to ensure complete alignment between complexes", + "Re-prefilter to ensure complete alignment between multimers", NULL, "Woosub Kim ", " ", @@ -362,7 +414,7 @@ std::vector foldseekCommands = { NULL, "", "", - CITATION_FOLDSEEK, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}} + CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}} }; std::vector externalThreshold = { {Parameters::DBTYPE_AMINO_ACIDS, 7, 197.0, 11.22}}; @@ -432,10 +484,26 @@ std::vector externalDownloads = { true, Parameters::DBTYPE_AMINO_ACIDS, structdatabases_sh, structdatabases_sh_len, {} }, + { + "BFMD", + "BFMD Big fantastic multimer database (combined multimers from large prediction projects).", + "Kim W et al. Rapid and Sensitive Protein Complex Alignment with Foldseek-Multimer. bioRxiv (2024)", + "https://foldseek.steineggerlab.workers.dev/bfmd.version", + true, Parameters::DBTYPE_AMINO_ACIDS, structdatabases_sh, structdatabases_sh_len, + {} + }, + { + "BFVD", + "BFVD Big fantastic virus database (predicted viral protein structures based on the viral sequence representatives of the UniRef30 clusters", + "Kim R et al. BFVD - a large repository of predicted viral protein structures. Nucleic Acids Research, gkae1119 (2024)", + "https://bfvd.steineggerlab.workers.dev", + true, Parameters::DBTYPE_AMINO_ACIDS, structdatabases_sh, structdatabases_sh_len, + {} + }, { "ProstT5", "Protein language model to predict 3Di directly from sequence.", - "Heinzinger et al. Bilingual Language Model for Protein Sequence and Structure. bioRxiv, 2023.07.23.550085v2 (2024)", + "Heinzinger et al. Bilingual language model for protein sequence and structure. NAR Genomics and Bioinformatics, lqae150 (2024)", "https://huggingface.co/Rostlab/ProstT5", false, Parameters::DBTYPE_AMINO_ACIDS, structdatabases_sh, structdatabases_sh_len, {} diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 825a2b7..c362cc1 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -21,6 +21,9 @@ extern int structureungappedalign(int argc, const char** argv, const Command &co extern int convert2pdb(int argc, const char** argv, const Command &command); extern int compressca(int argc, const char** argv, const Command &command); extern int scoremultimer(int argc, const char **argv, const Command& command); +extern int filtermultimer(int argc, const char **argv, const Command& command); +extern int easymultimercluster(int argc, const char** argv, const Command &command); +extern int multimercluster(int argc, const char** argv, const Command &command); extern int easymultimersearch(int argc, const char **argv, const Command &command); extern int createmultimerreport(int argc, const char **argv, const Command &command); extern int expandmultimer(int argc, const char **argv, const Command &command); diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 5483966..cf85e35 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -10,6 +10,7 @@ LocalParameters::LocalParameters() : Parameters(), PARAM_PREF_MODE(PARAM_PREF_MODE_ID,"--prefilter-mode", "Prefilter mode", "prefilter mode: 0: kmer/ungapped 1: ungapped, 2: nofilter",typeid(int), (void *) &prefMode, "^[0-2]{1}$"), PARAM_TMSCORE_THRESHOLD(PARAM_TMSCORE_THRESHOLD_ID,"--tmscore-threshold", "TMscore threshold", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &tmScoreThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_TMSCORE_THRESHOLD_MODE(PARAM_TMSCORE_THRESHOLD_MODE_ID,"--tmscore-threshold-mode", "TMscore threshold mode", "0: alignment, 1: query 2: target length",typeid(int), (void *) &tmScoreThrMode, "^[0-2]{1}$"), PARAM_TMALIGN_HIT_ORDER(PARAM_TMALIGN_HIT_ORDER_ID,"--tmalign-hit-order", "TMalign hit order", "order hits by 0: (qTM+tTM)/2, 1: qTM, 2: tTM, 3: min(qTM,tTM) 4: max(qTM,tTM)",typeid(int), (void *) &tmAlignHitOrder, "^[0-4]{1}$"), PARAM_LDDT_THRESHOLD(PARAM_LDDT_THRESHOLD_ID,"--lddt-threshold", "LDDT threshold", "accept alignments with a lddt > thr [0.0,1.0]",typeid(float), (void *) &lddtThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_SORT_BY_STRUCTURE_BITS(PARAM_SORT_BY_STRUCTURE_BITS_ID,"--sort-by-structure-bits", "Sort by structure bit score", "sort by bits*sqrt(alnlddt*alntmscore)",typeid(int), (void *) &sortByStructureBits, "^[0-1]{1}$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), @@ -21,7 +22,8 @@ LocalParameters::LocalParameters() : PARAM_EXACT_TMSCORE(PARAM_EXACT_TMSCORE_ID,"--exact-tmscore", "Exact TMscore","turn on fast exact TMscore (slow), default is approximate" ,typeid(int), (void *) &exactTMscore, "^[0-1]{1}$"), PARAM_N_SAMPLE(PARAM_N_SAMPLE_ID, "--n-sample", "Sample size","pick N random sample" ,typeid(int), (void *) &nsample, "^[0-9]{1}[0-9]*$"), PARAM_COORD_STORE_MODE(PARAM_COORD_STORE_MODE_ID, "--coord-store-mode", "Coord store mode", "Coordinate storage mode: \n1: C-alpha as float\n2: C-alpha as difference (uint16_t)", typeid(int), (void *) &coordStoreMode, "^[1-2]{1}$",MMseqsParameter::COMMAND_EXPERT), - PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD(PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD_ID, "--min-assigned-chains-ratio", "Minimum assigned chains percentage Threshold", "minimum percentage of assigned chains out of all query chains > thr [0,100] %", typeid(float), (void *) & minAssignedChainsThreshold, "^[0-9]*(\\.[0-9]+)?$"), + PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD(PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD_ID, "--min-assigned-chains-ratio", "Minimum assigned chains percentage Threshold", "Minimum ratio of assigned chains out of all query chains > thr [0.0,1.0]", typeid(float), (void *) & minAssignedChainsThreshold, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN), + PARAM_MONOMER_INCLUDE_MODE(PARAM_MONOMER_INCLUDE_MODE_ID, "--monomer-include-mode", "Monomer inclusion Mode for MultimerSerch", "Monomer Complex Inclusion 0: include monomers, 1: NOT include monomers", typeid(int), (void *) & monomerIncludeMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_ALIGN), PARAM_CLUSTER_SEARCH(PARAM_CLUSTER_SEARCH_ID, "--cluster-search", "Cluster search", "first find representative then align all cluster members", typeid(int), (void *) &clusterSearch, "^[0-1]{1}$",MMseqsParameter::COMMAND_MISC), PARAM_FILE_INCLUDE(PARAM_FILE_INCLUDE_ID, "--file-include", "File Inclusion Regex", "Include file names based on this regex", typeid(std::string), (void *) &fileInclude, "^.*$"), PARAM_FILE_EXCLUDE(PARAM_FILE_EXCLUDE_ID, "--file-exclude", "File Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &fileExclude, "^.*$"), @@ -33,8 +35,14 @@ LocalParameters::LocalParameters() : PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_PROSTT5_MODEL(PARAM_PROSTT5_MODEL_ID, "--prostt5-model", "Path to ProstT5", "Path to ProstT5 model", typeid(std::string), (void *) &prostt5Model, "^.*$", MMseqsParameter::COMMAND_COMMON), - PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON) -{ + PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON), + PARAM_DB_EXTRACTION_MODE(PARAM_DB_EXTRACTION_MODE_ID, "--db-extraction-mode", "Createdb extraction mode", "createdb extraction mode: 0: chain 1: interface", typeid(int), (void *) &dbExtractionMode, "^[0-1]{1}$"), + PARAM_DISTANCE_THRESHOLD(PARAM_DISTANCE_THRESHOLD_ID, "--distance-threshold", "Interface distance threshold", "Residues with C-beta below this threshold will be part of interface", typeid(float), (void *) &distanceThreshold, "^[0-9]*(\\.[0-9]+)?$"), + PARAM_MULTIMER_TM_THRESHOLD(PARAM_MULTIMER_TM_THRESHOLD_ID,"--multimer-tm-threshold", "TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtMultimerTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "chain TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_INTERFACE_LDDT_THRESHOLD(PARAM_INTERFACE_LDDT_THRESHOLD_ID,"--interface-lddt-threshold", "Interface LDDT threshold", "accept alignments with a lddt > thr [0.0,1.0]",typeid(float), (void *) &filtInterfaceLddtThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") + + { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$"; PARAM_ALIGNMENT_MODE.category = MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT; @@ -82,6 +90,8 @@ LocalParameters::LocalParameters() : #endif structurecreatedb.push_back(&PARAM_PROSTT5_MODEL); structurecreatedb.push_back(&PARAM_CHAIN_NAME_MODE); + structurecreatedb.push_back(&PARAM_DB_EXTRACTION_MODE); + structurecreatedb.push_back(&PARAM_DISTANCE_THRESHOLD); structurecreatedb.push_back(&PARAM_WRITE_MAPPING); structurecreatedb.push_back(&PARAM_MASK_BFACTOR_THRESHOLD); structurecreatedb.push_back(&PARAM_COORD_STORE_MODE); @@ -106,6 +116,7 @@ LocalParameters::LocalParameters() : tmalign.push_back(&PARAM_ADD_BACKTRACE); tmalign.push_back(&PARAM_INCLUDE_IDENTITY); tmalign.push_back(&PARAM_TMSCORE_THRESHOLD); + tmalign.push_back(&PARAM_TMSCORE_THRESHOLD_MODE); tmalign.push_back(&PARAM_TMALIGN_HIT_ORDER); tmalign.push_back(&PARAM_TMALIGN_FAST); tmalign.push_back(&PARAM_PRELOAD_MODE); @@ -114,11 +125,13 @@ LocalParameters::LocalParameters() : structurerescorediagonal.push_back(&PARAM_EXACT_TMSCORE); structurerescorediagonal.push_back(&PARAM_TMSCORE_THRESHOLD); + structurerescorediagonal.push_back(&PARAM_TMSCORE_THRESHOLD_MODE); structurerescorediagonal.push_back(&PARAM_LDDT_THRESHOLD); structurerescorediagonal.push_back(&PARAM_ALIGNMENT_TYPE); structurerescorediagonal = combineList(structurerescorediagonal, align); structurealign.push_back(&PARAM_TMSCORE_THRESHOLD); + structurealign.push_back(&PARAM_TMSCORE_THRESHOLD_MODE); structurealign.push_back(&PARAM_LDDT_THRESHOLD); structurealign.push_back(&PARAM_SORT_BY_STRUCTURE_BITS); structurealign.push_back(&PARAM_ALIGNMENT_TYPE); @@ -145,6 +158,7 @@ LocalParameters::LocalParameters() : easystructuresearchworkflow = combineList(structuresearchworkflow, structurecreatedb); easystructuresearchworkflow = combineList(easystructuresearchworkflow, convertalignments); + easystructuresearchworkflow = combineList(easystructuresearchworkflow, taxonomyreport); easystructuresearchworkflow.push_back(&PARAM_GREEDY_BEST_HITS); structureclusterworkflow = combineList(prefilter, structurealign); @@ -182,9 +196,19 @@ LocalParameters::LocalParameters() : //scorecmultimer scoremultimer.push_back(&PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD); + scoremultimer.push_back(&PARAM_MONOMER_INCLUDE_MODE); scoremultimer.push_back(&PARAM_THREADS); scoremultimer.push_back(&PARAM_V); + //filtermultimer + filtermultimer.push_back(&PARAM_C); + filtermultimer.push_back(&PARAM_COV_MODE); + filtermultimer.push_back(&PARAM_MULTIMER_TM_THRESHOLD); + filtermultimer.push_back(&PARAM_CHAIN_TM_THRESHOLD); + filtermultimer.push_back(&PARAM_INTERFACE_LDDT_THRESHOLD); + filtermultimer.push_back(&PARAM_THREADS); + filtermultimer.push_back(&PARAM_V); + // createmultimerreport createmultimerreport.push_back(&PARAM_DB_OUTPUT); createmultimerreport.push_back(&PARAM_THREADS); @@ -204,6 +228,15 @@ LocalParameters::LocalParameters() : easymultimersearchworkflow = combineList(easymultimersearchworkflow, createmultimerreport); easymultimersearchworkflow = removeParameter(easymultimersearchworkflow, PARAM_PROSTT5_MODEL); + + // multimerclusterworkflow + multimerclusterworkflow = combineList(multimersearchworkflow, filtermultimer); + multimerclusterworkflow = combineList(multimerclusterworkflow, clust); + + //easymultimerclusterworkflow + easymultimerclusterworkflow = combineList(structurecreatedb, multimerclusterworkflow); + easymultimerclusterworkflow = combineList(easymultimerclusterworkflow, result2repseq); + // expandmultimer expandmultimer.push_back(&PARAM_THREADS); expandmultimer.push_back(&PARAM_V); @@ -216,6 +249,7 @@ LocalParameters::LocalParameters() : prefMode = PREF_MODE_KMER; alignmentType = ALIGNMENT_TYPE_3DI_AA; tmScoreThr = 0.0; + tmScoreThrMode = TMSCORE_THRESHOLD_MODE_ALIGNMENT; tmAlignHitOrder = TMALIGN_HIT_ORDER_AVG; lddtThr = 0.0; evalThr = 10; @@ -223,6 +257,8 @@ LocalParameters::LocalParameters() : minDiagScoreThr = 30; maskBfactorThreshold = 0; chainNameMode = 0; + minAssignedChainsThreshold = 0.0; + monomerIncludeMode = 0; writeMapping = 0; tmAlignFast = 1; exactTMscore = 0; @@ -241,11 +277,15 @@ LocalParameters::LocalParameters() : eValueThrExpandMultimer = 10000.0; prostt5Model = ""; gpu = 0; - + dbExtractionMode = DB_EXTRACT_MODE_CHAIN; + distanceThreshold = 8.0; + filtMultimerTmThr = 0.0; + filtChainTmThr = 0.0; + filtInterfaceLddtThr = 0.0; citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); citations.emplace(CITATION_FOLDSEEK_MULTIMER, "Kim, W., Mirdita, M., Levy Karin, E., Gilchrist, C.L.M., Schweke, H., Söding, J., Levy, E., and Steinegger, M. Rapid and Sensitive Protein Complex Alignment with Foldseek-Multimer. bioRxiv, doi:10.1101/2024.04.14.589414 (2024)"); - citations.emplace(CITATION_PROSTT5, "Heinzinger, M., Weissenow, K., Gomez Sanchez, J., Henkel, A., Mirdita, M., Steinegger, M., Steinegger, M., and Burkhard, R. Bilingual Language Model for Protein Sequence and Structure. bioRxiv, doi:10.1101/2023.07.23.550085 (2024)"); - + citations.emplace(CITATION_PROSTT5, "Heinzinger, M., Weissenow, K., Gomez Sanchez, J., Henkel, A., Mirdita, M., Steinegger, M., and Burkhard, R. Bilingual Language Model for Protein Sequence and Structure. NAR Genomics and Bioinformatics, doi:10.1093/nargab/lqae150 (2024)"); + //rewrite param vals. PARAM_FORMAT_OUTPUT.description = "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,\nlddt,lddtfull,qca,tca,t,u,qtmscore,ttmscore,alntmscore,rmsd,prob\ncomplexqtmscore,complexttmscore,complexu,complext,complexassignid\n"; } diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index faff6af..042bf2b 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -32,6 +32,11 @@ class LocalParameters : public Parameters { static const int ALIGNMENT_TYPE_TMALIGN = 1; static const int ALIGNMENT_TYPE_3DI_AA = 2; + static const int TMSCORE_THRESHOLD_MODE_ALIGNMENT = 0; + static const int TMSCORE_THRESHOLD_MODE_QUERY = 1; + static const int TMSCORE_THRESHOLD_MODE_TARGET = 2; + static const int TMSCORE_THRESHOLD_MODE_MIN = 3; + static const int PREF_MODE_KMER = 0; static const int PREF_MODE_UNGAPPED = 1; static const int PREF_MODE_EXHAUSTIVE = 2; @@ -65,6 +70,9 @@ class LocalParameters : public Parameters { static const int OUTFMT_COMPLEX_U = 56; static const int OUTFMT_COMPLEX_T = 57; + static const int DB_EXTRACT_MODE_CHAIN = 0; + static const int DB_EXTRACT_MODE_INTERFACE = 1; + static const int COORD_STORE_MODE_CA_FLOAT = 1; static const int COORD_STORE_MODE_CA_DIFF = 2; static const int COORD_STORE_MODE_CA_PLAIN_TEXT = 3; @@ -81,6 +89,11 @@ class LocalParameters : public Parameters { static const int PDB_OUTPUT_MODE_SINGLECHAIN = 1; static const int PDB_OUTPUT_MODE_COMPLEX = 2; + // filter mode + // static const int FILTER_MODE_INTERFACE = 0; + // static const int FILTER_MODE_CONFORMATION = 1; + // static const int FILTER_MODE_LOOSE = 2; + // TODO static const unsigned int FORMAT_ALIGNMENT_PDB_SUPERPOSED = 5; std::vector strucclust; @@ -96,6 +109,9 @@ class LocalParameters : public Parameters { std::vector structurecreatedb; std::vector compressca; std::vector scoremultimer; + std::vector filtermultimer; + std::vector multimerclusterworkflow; + std::vector easymultimerclusterworkflow; std::vector multimersearchworkflow; std::vector easymultimersearchworkflow; std::vector createmultimerreport; @@ -104,6 +120,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_PREF_MODE) PARAMETER(PARAM_TMSCORE_THRESHOLD) + PARAMETER(PARAM_TMSCORE_THRESHOLD_MODE) PARAMETER(PARAM_TMALIGN_HIT_ORDER) PARAMETER(PARAM_LDDT_THRESHOLD) PARAMETER(PARAM_SORT_BY_STRUCTURE_BITS) @@ -116,6 +133,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_N_SAMPLE) PARAMETER(PARAM_COORD_STORE_MODE) PARAMETER(PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD) + PARAMETER(PARAM_MONOMER_INCLUDE_MODE) PARAMETER(PARAM_CLUSTER_SEARCH) PARAMETER(PARAM_FILE_INCLUDE) PARAMETER(PARAM_FILE_EXCLUDE) @@ -128,9 +146,15 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_PDB_OUTPUT_MODE) PARAMETER(PARAM_PROSTT5_MODEL) PARAMETER(PARAM_GPU) + PARAMETER(PARAM_DB_EXTRACTION_MODE) + PARAMETER(PARAM_DISTANCE_THRESHOLD) + PARAMETER(PARAM_MULTIMER_TM_THRESHOLD) + PARAMETER(PARAM_CHAIN_TM_THRESHOLD) + PARAMETER(PARAM_INTERFACE_LDDT_THRESHOLD) int prefMode; float tmScoreThr; + int tmScoreThrMode; int tmAlignHitOrder; float lddtThr; int sortByStructureBits; @@ -143,6 +167,7 @@ class LocalParameters : public Parameters { int nsample; int coordStoreMode; float minAssignedChainsThreshold; + int monomerIncludeMode; int clusterSearch; std::string fileInclude; std::string fileExclude; @@ -151,8 +176,13 @@ class LocalParameters : public Parameters { double eValueThrExpandMultimer; int inputFormat; int pdbOutputMode; + float filtMultimerTmThr; + float filtChainTmThr; + float filtInterfaceLddtThr; std::string prostt5Model; int gpu; + int dbExtractionMode; + float distanceThreshold; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner, diff --git a/src/commons/TMaligner.cpp b/src/commons/TMaligner.cpp index 03bd31d..94a2295 100644 --- a/src/commons/TMaligner.cpp +++ b/src/commons/TMaligner.cpp @@ -8,6 +8,7 @@ #include #include "StructureSmithWaterman.h" #include "StructureSmithWaterman.h" +#include "LocalParameters.h" TMaligner::TMaligner(unsigned int maxSeqLen, bool tmAlignFast, bool tmScoreOnly, bool computeExactScore) : tmAlignFast(tmAlignFast), @@ -323,4 +324,17 @@ Matcher::result_t TMaligner::align(unsigned int dbKey, float *x, float *y, float float qCov = StructureSmithWaterman::computeCov(shiftQ, queryLen-endQ-1, queryLen); float tCov = StructureSmithWaterman::computeCov(shiftT, targetLen-endT-1, targetLen); return Matcher::result_t(dbKey, TM_0*100000 , qCov, tCov, seqId, TM2, backtrace.length(), shiftQ, queryLen-endQ-1, queryLen, shiftT, targetLen-endT-1, targetLen, Matcher::compressAlignment(backtrace)); -} \ No newline at end of file +} + +unsigned int TMaligner::normalization(int mode, unsigned int alignmentLen, unsigned int queryLen, unsigned int targetLen) { + if(mode == LocalParameters::TMSCORE_THRESHOLD_MODE_ALIGNMENT) { + return alignmentLen; + } else if(mode == LocalParameters::TMSCORE_THRESHOLD_MODE_QUERY){ + return queryLen; + } else if(mode == LocalParameters::TMSCORE_THRESHOLD_MODE_TARGET){ + return targetLen; + } else if(mode == LocalParameters::TMSCORE_THRESHOLD_MODE_MIN){ + return std::min(queryLen, targetLen); + } + return 0; +} diff --git a/src/commons/TMaligner.h b/src/commons/TMaligner.h index 2d1488b..9c99fe4 100644 --- a/src/commons/TMaligner.h +++ b/src/commons/TMaligner.h @@ -43,6 +43,8 @@ class TMaligner{ Matcher::result_t align(unsigned int dbKey, float *target_x, float *target_y, float *target_z, char * targetSeq, unsigned int targetLen, float &TM); + static unsigned int normalization(int mode, unsigned int alignmentLen, unsigned int queryLen, unsigned int targetLen); + private: AffineNeedlemanWunsch * affineNW; std::string backtrace; diff --git a/src/strucclustutils/CMakeLists.txt b/src/strucclustutils/CMakeLists.txt index 4d9ce71..f906816 100644 --- a/src/strucclustutils/CMakeLists.txt +++ b/src/strucclustutils/CMakeLists.txt @@ -14,6 +14,7 @@ set(strucclustutils_source_files strucclustutils/convert2pdb.cpp strucclustutils/compressca.cpp strucclustutils/scoremultimer.cpp + strucclustutils/filtermultimer.cpp strucclustutils/createmultimerreport.cpp strucclustutils/MultimerUtil.h strucclustutils/expandmultimer.cpp diff --git a/src/strucclustutils/GemmiWrapper.cpp b/src/strucclustutils/GemmiWrapper.cpp index c638e65..585fe53 100644 --- a/src/strucclustutils/GemmiWrapper.cpp +++ b/src/strucclustutils/GemmiWrapper.cpp @@ -13,31 +13,164 @@ #include GemmiWrapper::GemmiWrapper(){ - threeAA2oneAA = {{"ALA",'A'}, {"ARG",'R'}, {"ASN",'N'}, {"ASP",'D'}, - {"CYS",'C'}, {"GLN",'Q'}, {"GLU",'E'}, {"GLY",'G'}, - {"HIS",'H'}, {"ILE",'I'}, {"LEU",'L'}, {"LYS",'K'}, - {"MET",'M'}, {"PHE",'F'}, {"PRO",'P'}, {"SER",'S'}, - {"THR",'T'}, {"TRP",'W'}, {"TYR",'Y'}, {"VAL",'V'}, - // modified res - {"MSE",'M'}, {"MLY",'K'}, {"FME",'M'}, {"HYP",'P'}, - {"TPO",'T'}, {"CSO",'C'}, {"SEP",'S'}, {"M3L",'K'}, - {"HSK",'H'}, {"SAC",'S'}, {"PCA",'E'}, {"DAL",'A'}, - {"CME",'C'}, {"CSD",'C'}, {"OCS",'C'}, {"DPR",'P'}, - {"B3K",'K'}, {"ALY",'K'}, {"YCM",'C'}, {"MLZ",'K'}, - {"4BF",'Y'}, {"KCX",'K'}, {"B3E",'E'}, {"B3D",'D'}, - {"HZP",'P'}, {"CSX",'C'}, {"BAL",'A'}, {"HIC",'H'}, - {"DBZ",'A'}, {"DCY",'C'}, {"DVA",'V'}, {"NLE",'L'}, - {"SMC",'C'}, {"AGM",'R'}, {"B3A",'A'}, {"DAS",'D'}, - {"DLY",'K'}, {"DSN",'S'}, {"DTH",'T'}, {"GL3",'G'}, - {"HY3",'P'}, {"LLP",'K'}, {"MGN",'Q'}, {"MHS",'H'}, - {"TRQ",'W'}, {"B3Y",'Y'}, {"PHI",'F'}, {"PTR",'Y'}, - {"TYS",'Y'}, {"IAS",'D'}, {"GPL",'K'}, {"KYN",'W'}, - {"CSD",'C'}, {"SEC",'C'}, - // unknown - {"UNK",'X'}}; fixupBuffer = NULL; } +// adapted from Gemmi find_tabulated_residue +char threeToOneAA(const std::string& three) { + if (three.size() != 3) { + return 'X'; + } +#define ID(s) (s[0] << 16 | s[1] << 8 | s[2]) + switch (ID(three.c_str())) { + case ID("ALA"): return 'A'; + case ID("ARG"): return 'R'; + case ID("ASN"): return 'N'; + case ID("ABA"): return 'A'; + case ID("ASP"): return 'D'; + case ID("ASX"): return 'B'; + case ID("CYS"): return 'C'; // also BUF + case ID("CSH"): return 'S'; + case ID("GLN"): return 'Q'; + case ID("GLU"): return 'E'; + case ID("GLX"): return 'Z'; + case ID("GLY"): return 'G'; // also BUF + case ID("HIS"): return 'H'; + case ID("ILE"): return 'I'; + case ID("LEU"): return 'L'; + case ID("LYS"): return 'K'; + case ID("MET"): return 'M'; + case ID("MSE"): return 'M'; + case ID("ORN"): return 'A'; + case ID("PHE"): return 'F'; + case ID("PRO"): return 'P'; + case ID("SER"): return 'S'; + case ID("THR"): return 'T'; + case ID("TRY"): // fall-through - synonym for TRP + case ID("TRP"): return 'W'; + case ID("TYR"): return 'Y'; + case ID("UNK"): return 'X'; + case ID("VAL"): return 'V'; + case ID("SEC"): return 'C'; // mapped to U in gemmi, C in previous FS + case ID("PYL"): return 'O'; + case ID("SEP"): return 'S'; + case ID("TPO"): return 'T'; + case ID("PCA"): return 'E'; + case ID("CSO"): return 'C'; + case ID("PTR"): return 'Y'; + case ID("KCX"): return 'K'; + case ID("CSD"): return 'C'; + case ID("LLP"): return 'K'; + case ID("CME"): return 'C'; + case ID("MLY"): return 'K'; + case ID("DAL"): return 'A'; + case ID("TYS"): return 'Y'; + case ID("OCS"): return 'C'; + case ID("M3L"): return 'K'; + case ID("FME"): return 'M'; + case ID("ALY"): return 'K'; + case ID("HYP"): return 'P'; + case ID("CAS"): return 'C'; + case ID("CRO"): return 'T'; + case ID("CSX"): return 'C'; + case ID("DPR"): return 'P'; // also BUF + case ID("DGL"): return 'E'; + case ID("DVA"): return 'V'; + case ID("CSS"): return 'C'; + case ID("DPN"): return 'F'; + case ID("DSN"): return 'S'; + case ID("DLE"): return 'L'; + case ID("HIC"): return 'H'; + case ID("NLE"): return 'L'; + case ID("MVA"): return 'V'; + case ID("MLZ"): return 'K'; + case ID("CR2"): return 'G'; + case ID("SAR"): return 'G'; + case ID("DAR"): return 'R'; + case ID("DLY"): return 'K'; + case ID("YCM"): return 'C'; + case ID("NRQ"): return 'M'; + case ID("CGU"): return 'E'; + case ID("0TD"): return 'D'; + case ID("MLE"): return 'L'; + case ID("DAS"): return 'D'; + case ID("DTR"): return 'W'; + case ID("CXM"): return 'M'; + case ID("TPQ"): return 'Y'; + case ID("DCY"): return 'C'; + case ID("DSG"): return 'N'; + case ID("DTY"): return 'Y'; + case ID("DHI"): return 'H'; + case ID("MEN"): return 'N'; + case ID("DTH"): return 'T'; + case ID("SAC"): return 'S'; + case ID("DGN"): return 'Q'; + case ID("AIB"): return 'A'; + case ID("SMC"): return 'C'; + case ID("IAS"): return 'D'; + case ID("CIR"): return 'R'; + case ID("BMT"): return 'T'; + case ID("DIL"): return 'I'; + case ID("FGA"): return 'E'; + case ID("PHI"): return 'F'; + case ID("CRQ"): return 'Q'; + case ID("SME"): return 'M'; + case ID("GHP"): return 'G'; + case ID("MHO"): return 'M'; + case ID("NEP"): return 'H'; + case ID("TRQ"): return 'W'; + case ID("TOX"): return 'W'; + case ID("ALC"): return 'A'; + // case ID("3FG"): return ' '; + case ID("SCH"): return 'C'; + case ID("MDO"): return 'A'; + case ID("MAA"): return 'A'; + case ID("GYS"): return 'S'; + case ID("MK8"): return 'L'; + case ID("CR8"): return 'H'; + case ID("KPI"): return 'K'; + case ID("SCY"): return 'C'; + case ID("DHA"): return 'S'; + case ID("OMY"): return 'Y'; + case ID("CAF"): return 'C'; + case ID("0AF"): return 'W'; + case ID("SNN"): return 'N'; + case ID("MHS"): return 'H'; + // case ID("MLU"): return ' '; + case ID("SNC"): return 'C'; + case ID("PHD"): return 'D'; + case ID("B3E"): return 'E'; + case ID("MEA"): return 'F'; + case ID("MED"): return 'M'; + case ID("OAS"): return 'S'; + case ID("GL3"): return 'G'; + case ID("FVA"): return 'V'; + case ID("PHL"): return 'F'; + case ID("CRF"): return 'T'; + // case ID("OMZ"): return ' '; + case ID("BFD"): return 'D'; + case ID("MEQ"): return 'Q'; + case ID("DAB"): return 'A'; + case ID("AGM"): return 'R'; + // added from previous FS code + case ID("4BF"): return 'Y'; + case ID("B3A"): return 'A'; + case ID("B3D"): return 'D'; + case ID("B3K"): return 'K'; + case ID("B3Y"): return 'Y'; + case ID("BAL"): return 'A'; + case ID("DBZ"): return 'A'; + case ID("GPL"): return 'K'; + case ID("HSK"): return 'H'; + case ID("HY3"): return 'P'; + case ID("HZP"): return 'P'; + case ID("KYN"): return 'W'; + case ID("MGN"): return 'Q'; + } + return 'X'; +#undef ID +} + std::unordered_map getEntityTaxIDMapping(gemmi::cif::Document& doc) { std::unordered_map entity_to_taxid; static const std::vector> loops_with_taxids = { @@ -263,7 +396,8 @@ bool GemmiWrapper::loadFoldcompStructure(std::istream& stream, const std::string if (coordinates.size() == 0) { return false; } - + seq3di.clear(); + taxIds.clear(); title.clear(); chain.clear(); names.clear(); @@ -302,11 +436,7 @@ bool GemmiWrapper::loadFoldcompStructure(std::istream& stream, const std::string c_atom = {NAN, NAN, NAN}; ca_atom_bfactor = 0.0; } - if (threeAA2oneAA.find(atom.residue) == threeAA2oneAA.end()) { - ami.push_back('X'); - } else { - ami.push_back(threeAA2oneAA[atom.residue]); - } + ami.push_back(threeToOneAA(atom.residue)); residueIndex = atom.residue_index; } @@ -369,39 +499,23 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename, names.push_back(name); int taxId = -1; - for (gemmi::Residue &res : ch.first_conformer()) { - if (taxId == -1) { - auto it = entity_to_tax_id.find(res.entity_id); - if (it != entity_to_tax_id.end()) { - taxId = it->second; - } + for (const gemmi::Residue &res : ch.first_conformer()) { + // only consider polymers and unknowns + if (res.entity_type != gemmi::EntityType::Polymer && res.entity_type != gemmi::EntityType::Unknown) { + continue; } - bool isHetAtomInList = res.het_flag == 'H' && threeAA2oneAA.find(res.name) != threeAA2oneAA.end(); - if (isHetAtomInList == false && res.het_flag != 'A') + // only consider ATOM and HETATM + if (res.het_flag == '\0') { continue; - if (isHetAtomInList) { - bool notPolymer = res.entity_type != gemmi::EntityType::Polymer; - if (notPolymer == true) { - continue; - } - bool hasCA = false; - for (gemmi::Atom &atom : res.atoms) { - if (atom.name == "CA") { - hasCA = true; - break; - } - } - if (hasCA == false) { - continue; - } } + Vec3 ca_atom = {NAN, NAN, NAN}; Vec3 cb_atom = {NAN, NAN, NAN}; Vec3 n_atom = {NAN, NAN, NAN}; Vec3 c_atom = {NAN, NAN, NAN}; - float ca_atom_bfactor; + float ca_atom_bfactor = 0.0f; bool hasCA = false; - for (gemmi::Atom &atom : res.atoms) { + for (const gemmi::Atom &atom : res.atoms) { if (atom.name == "CA") { ca_atom.x = atom.pos.x; ca_atom.y = atom.pos.y; @@ -422,7 +536,7 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename, c_atom.z = atom.pos.z; } } - if(hasCA == false){ + if (hasCA == false) { continue; } ca_bfactor.push_back(ca_atom_bfactor); @@ -430,12 +544,16 @@ void GemmiWrapper::updateStructure(void * void_st, const std::string& filename, cb.push_back(cb_atom); n.push_back(n_atom); c.push_back(c_atom); - currPos++; - if (threeAA2oneAA.find(res.name) == threeAA2oneAA.end()) { - ami.push_back('X'); - } else { - ami.push_back(threeAA2oneAA[res.name]); + + ami.push_back(threeToOneAA(res.name)); + + if (taxId == -1) { + auto it = entity_to_tax_id.find(res.entity_id); + if (it != entity_to_tax_id.end()) { + taxId = it->second; + } } + currPos++; } taxIds.push_back(taxId == -1 ? 0 : taxId); chain.push_back(std::make_pair(chainStartPos, currPos)); diff --git a/src/strucclustutils/GemmiWrapper.h b/src/strucclustutils/GemmiWrapper.h index 7ea0902..691a743 100644 --- a/src/strucclustutils/GemmiWrapper.h +++ b/src/strucclustutils/GemmiWrapper.h @@ -40,6 +40,7 @@ class GemmiWrapper { std::vector c; std::vector cb; std::vector ami; + std::vector seq3di; std::vector names; std::vector chainNames; std::vector modelIndices; @@ -52,7 +53,6 @@ class GemmiWrapper { size_t fixupBufferSize; private: - std::unordered_map threeAA2oneAA; int modelIt; int chainIt; diff --git a/src/strucclustutils/MultimerUtil.h b/src/strucclustutils/MultimerUtil.h index 6d8c666..eba164e 100644 --- a/src/strucclustutils/MultimerUtil.h +++ b/src/strucclustutils/MultimerUtil.h @@ -5,24 +5,21 @@ #include "TMaligner.h" const unsigned int NOT_AVAILABLE_CHAIN_KEY = 4294967295; -const double MAX_ASSIGNED_CHAIN_RATIO = 1.0; +const float MAX_ASSIGNED_CHAIN_RATIO = 1.0; const double TOO_SMALL_MEAN = 1.0; const double TOO_SMALL_CV = 0.1; const double FILTERED_OUT = 0.0; const unsigned int INITIALIZED_LABEL = 0; const unsigned int MIN_PTS = 2; -const float DEFAULT_EPS = 0.1; const float LEARNING_RATE = 0.1; const float TM_SCORE_MARGIN = 0.7; -const float DEF_TM_SCORE = -1.0; -const int UNINITIALIZED = 0; const unsigned int MULTIPLE_CHAINED_COMPLEX = 2; const unsigned int SIZE_OF_SUPERPOSITION_VECTOR = 12; +const int SKIP_MONOMERS = 1; typedef std::pair compNameChainName_t; typedef std::map chainKeyToComplexId_t; typedef std::map> complexIdToChainKeys_t; typedef std::vector cluster_t; -typedef std::map, float> distMap_t; typedef std::string resultToWrite_t; typedef std::string chainName_t; typedef std::pair resultToWriteWithKey_t; @@ -96,13 +93,12 @@ struct ChainToChainAln { unsigned int label; float tmScore; - float getDistance(const ChainToChainAln &o) { + float getDistance(const ChainToChainAln &o) const { float dist = 0; for (size_t i=0; i(entry[15]); @@ -229,4 +225,28 @@ static ComplexDataHandler parseScoreComplexResult(const char *data, Matcher::res return {assId, qTmScore, tTmScore, uString, tString, true}; } -#endif //FOLDSEEK_MULTIMERUTIL_H \ No newline at end of file +static char* fastfloatToBuffer(float value, char* buffer) { + if (value < 0) { + value *= -1; + *(buffer) = '-'; + buffer++; + } + int value1 = (int)(value); + buffer = Itoa::i32toa_sse2(value1, buffer); + *(buffer) = '.'; + buffer++; + + double value2 = value - value1; + if (value2 < 0.1){ + *(buffer) = '0'; + buffer++; + } + if (value2 < 0.01){ + *(buffer) = '0'; + buffer++; + } + buffer = Itoa::i32toa_sse2((int)(value2 * 1000), buffer); + return buffer; +} + +#endif //FOLDSEEK_MULTIMERUTIL_H diff --git a/src/strucclustutils/aln2tmscore.cpp b/src/strucclustutils/aln2tmscore.cpp index b1d3c8a..73807c2 100644 --- a/src/strucclustutils/aln2tmscore.cpp +++ b/src/strucclustutils/aln2tmscore.cpp @@ -103,8 +103,7 @@ int aln2tmscore(int argc, const char **argv, const Command& command) { // Matching residue index collection TMaligner::TMscoreResult tmres = tmaln.computeTMscore(tdata, &tdata[targetLen], &tdata[targetLen + targetLen], targetLen, res.qStartPos, res.dbStartPos, res.backtrace, - std::min(static_cast(res.backtrace.length()), std::min(res.dbLen, res.qLen))); - + TMaligner::normalization(par.tmScoreThrMode, std::min(res.qEndPos - res.qStartPos, res.dbEndPos - res.dbStartPos ), res.qLen, res.dbLen)); //std::cout << TMalnScore << std::endl; resultsStr.append(SSTR(dbKey)); resultsStr.push_back(' '); @@ -153,4 +152,4 @@ int aln2tmscore(int argc, const char **argv, const Command& command) { } return EXIT_SUCCESS; -} \ No newline at end of file +} diff --git a/src/strucclustutils/createmultimerreport.cpp b/src/strucclustutils/createmultimerreport.cpp index 1256d5c..ff12575 100644 --- a/src/strucclustutils/createmultimerreport.cpp +++ b/src/strucclustutils/createmultimerreport.cpp @@ -108,7 +108,6 @@ int createmultimerreport(int argc, const char **argv, const Command &command) { std::string qLookupFile = par.db1 + ".lookup"; TranslateNucl translateNucl(static_cast(par.translationTable)); - Matcher::result_t res; std::map qChainKeyToComplexIdMap; std::map> qComplexIdToChainKeyMap; std::vector qComplexIdVec; @@ -178,9 +177,9 @@ int createmultimerreport(int argc, const char **argv, const Command &command) { } // MP end SORT_PARALLEL(complexResults.begin(), complexResults.end(), compareComplexResultByQuery); for (size_t complexResIdx = 0; complexResIdx < complexResults.size(); complexResIdx++) { - const ScoreComplexResult& res = complexResults[complexResIdx]; - const resultToWrite_t& data = res.resultToWrite; - resultWriter.writeData(data.c_str(), data.length(), res.assId, 0, isDb, isDb); + const ScoreComplexResult& cRes = complexResults[complexResIdx]; + const resultToWrite_t& data = cRes.resultToWrite; + resultWriter.writeData(data.c_str(), data.length(), cRes.assId, 0, isDb, isDb); } resultWriter.close(true); if (isDb == false) { diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp new file mode 100644 index 0000000..005502e --- /dev/null +++ b/src/strucclustutils/filtermultimer.cpp @@ -0,0 +1,787 @@ +#include "DBWriter.h" +#include "Util.h" +#include "LocalParameters.h" +#include "DBReader.h" +#include "IndexReader.h" +#include "FileUtil.h" +#include "Coordinate16.h" +#include "tmalign/basic_fun.h" +#include "MultimerUtil.h" +#include "LDDT.h" +#include +#include + +#ifdef OPENMP +#include +#endif + +struct Complex { + int complexId; + unsigned int nChain; + unsigned int complexLength; + std::string complexName; + std::vector chainLengths; + std::vector chainKeys; + + // Coordinate16 Coords; + + Complex() : complexId(0), nChain(0), complexLength(0), complexName("") {} + ~Complex() { + chainKeys.clear(); + } +}; + +struct AlignedCoordinate { + std::vector x; + std::vector y; + std::vector z; + AlignedCoordinate() {} + AlignedCoordinate(size_t size) { + x.resize(size); + y.resize(size); + z.resize(size); + } + ~AlignedCoordinate() { + x.clear(); + y.clear(); + z.clear(); + } +}; + +unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (qcov+tcov)/2; + case Parameters::COV_MODE_TARGET: + return qcov; + case Parameters::COV_MODE_QUERY: + return tcov; + default: + return 0; + } +} + +class ComplexFilterCriteria { +public: + unsigned int targetComplexId; + + // per complex + unsigned int qTotalAlnLen; + unsigned int tTotalAlnLen; + float qCov; + float tCov; + float interfaceLddt; + double qTm; + double tTm; + double avgTm; + float t[3]; + float u[3][3]; + + // per chain : criteria for chainTmThr & lddtThr + std::vector qAlnChainKeys; + std::vector tAlnChainKeys; + std::vector qAlnChains; + std::vector tAlnChains; + + std::vector qAlnChainTms; + std::vector tAlnChainTms; + + ComplexFilterCriteria() {} + ComplexFilterCriteria(unsigned int targetComplexId, double qTm, double tTm, float tstring[3], float ustring[3][3]) : + targetComplexId(targetComplexId), qTotalAlnLen(0), tTotalAlnLen(0), interfaceLddt(0), qTm(qTm), tTm(tTm) { + std::copy(tstring, tstring + 3, t); + for (int i = 0; i < 3; i++) { + std::copy(ustring[i], ustring[i] + 3, u[i]); + } + } + ~ComplexFilterCriteria() { + qAlnChainTms.clear(); + tAlnChainTms.clear(); + qAlnChainKeys.clear(); + tAlnChainKeys.clear(); + qAlnChains.clear(); + tAlnChains.clear(); + } + + bool hasTm(float TmThr, int covMode) { + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return ((qTm>= TmThr) && (tTm >= TmThr)); + case Parameters::COV_MODE_TARGET: + return (tTm >= TmThr); + case Parameters::COV_MODE_QUERY: + return (qTm >= TmThr); + default: + return true; + } + } + + bool hasChainTm(float chainTmThr, int covMode, unsigned int qChainNum, unsigned int tChainNum) { + if (qAlnChainTms.size()= iLddtThr); + } + bool satisfy(int covMode, float covThr, float TmThr, float chainTmThr, float iLddtThr, size_t qChainNum, size_t tChainNum ) { + const bool covOK = covThr ? Util::hasCoverage(covThr, covMode, qCov, tCov) : true; + const bool TmOK = TmThr ? hasTm(TmThr, covMode) : true; + const bool chainTmOK = chainTmThr ? hasChainTm(chainTmThr, covMode, qChainNum, tChainNum) : true; + const bool chainNumOK = hasChainNum(covMode, qChainNum, tChainNum); + const bool lddtOK = iLddtThr ? hasInterfaceLDDT(iLddtThr, qChainNum, tChainNum) : true; + // calculateAvgTm(covMode); + return (covOK && TmOK && chainTmOK && lddtOK && chainNumOK); + } + + void updateAln(unsigned int qAlnLen, unsigned int tAlnLen) { + qTotalAlnLen += qAlnLen; + tTotalAlnLen += tAlnLen; + } + + void updateChainTmScore(double qChainTm, double tChainTm) { + qAlnChainTms.push_back(qChainTm); + tAlnChainTms.push_back(tChainTm); + } + + void fillChainAlignment(unsigned int qChainKey, unsigned int tChainKey, unsigned int alnLen, + float *qdata, float *tdata, const std::string &cigar, int qStartPos, int tStartPos, int qLen, int tLen) { + AlignedCoordinate qChain; + AlignedCoordinate tChain; + int qi = qStartPos; + int ti = tStartPos; + int mi = 0; + std::string backtrace = Matcher::uncompressAlignment(cigar); + + qChain.x.resize(alnLen); + qChain.y.resize(alnLen); + qChain.z.resize(alnLen); + tChain.x.resize(alnLen); + tChain.y.resize(alnLen); + tChain.z.resize(alnLen); + + for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { + if (backtrace[btPos] == 'M') { + qChain.x[mi] = qdata[qi]; + qChain.y[mi] = qdata[qLen + qi]; + qChain.z[mi] = qdata[2*qLen + qi]; + tChain.x[mi] = tdata[ti]; + tChain.y[mi] = tdata[tLen + ti]; + tChain.z[mi] = tdata[2*tLen + ti]; + qi++; + ti++; + mi++; + } + else if (backtrace[btPos] == 'I') { + qi++; + } + else { + ti++; + } + } + qAlnChainKeys.push_back(qChainKey); + tAlnChainKeys.push_back(tChainKey); + qAlnChains.push_back(qChain); + tAlnChains.push_back(tChain); + } + // void update(unsigned int qChainKey, unsigned int tChainKey, double qChainTm, double tChainTm) { + // this->qAlnChainTms.push_back(qChainTm); + // this->tAlnChainTms.push_back(tChainTm); + + // this->qAlnChainKeys.push_back(qChainKey); + // this->tAlnChainKeys.push_back(tChainKey); + // } + + void calcCov(unsigned int qLen, unsigned int tLen) { + qCov = static_cast(qTotalAlnLen) / static_cast(qLen); + tCov = static_cast(tTotalAlnLen) / static_cast(tLen); + } + + void computeInterfaceLddt(float threshold = 8) { + if (qAlnChains.size() == 1) { + interfaceLddt = 1; + } + float t2 = threshold * threshold; + std::vector> qInterfacePos(qAlnChains.size()); // chainIdx, resIdx + unsigned int intLen = 0; + // Find and save interface Coordinates + for (size_t chainIdx1 = 0; chainIdx1 < qAlnChains.size(); chainIdx1++) { + for (size_t chainIdx2 = chainIdx1+1; chainIdx2 < qAlnChains.size(); chainIdx2++) { + AlignedCoordinate qChain1 = qAlnChains[chainIdx1]; + AlignedCoordinate qChain2 = qAlnChains[chainIdx2]; + for (size_t resIdx1 = 0; resIdx1 < qChain1.x.size(); resIdx1++) { + for (size_t resIdx2 = 0; resIdx2 < qChain2.x.size(); resIdx2++) { + float dist = BasicFunction::dist(qChain1.x[resIdx1], qChain1.y[resIdx1], qChain1.z[resIdx1], + qChain2.x[resIdx2], qChain2.y[resIdx2], qChain2.z[resIdx2]); + if (dist < t2) { + if (qInterfacePos[chainIdx1].find(resIdx1) == qInterfacePos[chainIdx1].end()) { + qInterfacePos[chainIdx1].insert(resIdx1); + intLen++; + } + if (qInterfacePos[chainIdx2].find(resIdx2) == qInterfacePos[chainIdx2].end()) { + qInterfacePos[chainIdx2].insert(resIdx2); + intLen++; + } + } + } + } + } + } + + if (intLen == 0) { + return; + } + AlignedCoordinate qInterface(intLen); + AlignedCoordinate tInterface(intLen); + size_t idx = 0; + for (size_t chainIdx = 0; chainIdx < qInterfacePos.size(); chainIdx++) { + if (qInterfacePos[chainIdx].size() >= 4) { + for (size_t resIdx: qInterfacePos[chainIdx]) { + qInterface.x[idx] = qAlnChains[chainIdx].x[resIdx]; + qInterface.y[idx] = qAlnChains[chainIdx].y[resIdx]; + qInterface.z[idx] = qAlnChains[chainIdx].z[resIdx]; + tInterface.x[idx] = tAlnChains[chainIdx].x[resIdx]; + tInterface.y[idx] = tAlnChains[chainIdx].y[resIdx]; + tInterface.z[idx] = tAlnChains[chainIdx].z[resIdx]; + idx++; + } + } + } + std::string bt(intLen, 'M'); + LDDTCalculator *lddtcalculator = NULL; + lddtcalculator = new LDDTCalculator(intLen+1, intLen+1); + lddtcalculator->initQuery(intLen, &qInterface.x[0], &qInterface.y[0], &qInterface.z[0]); + LDDTCalculator::LDDTScoreResult lddtres = lddtcalculator->computeLDDTScore(intLen, 0, 0, bt, &tInterface.x[0], &tInterface.y[0], &tInterface.z[0]); + interfaceLddt = lddtres.avgLddtScore; + delete lddtcalculator; + } +}; + + +char* filterToBuffer(ComplexFilterCriteria cmplfiltcrit, char* tmpBuff){ + *(tmpBuff-1) = '\t'; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.qCov, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.tCov, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.qTm, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.tTm, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.interfaceLddt, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[0][0], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[0][1], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[0][2], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[1][0], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[1][1], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[1][2], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[2][0], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[2][1], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[2][2], tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.t[0], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.t[1], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.t[2], tmpBuff); + *(tmpBuff-1) = '\n'; + return tmpBuff; +} + +void fillUArr(const std::string &uString, float (&u)[3][3]) { + std::string tmp; + int i = 0; + int j=0; + const int ulen = static_cast(uString.size()); + for (int k=0; k < ulen; k++) { + if (k==ulen-1) { + u[i][j] = std::stof(tmp); + } else if (uString[k] == ',') { + u[i][j] = std::stof(tmp); + tmp.clear(); + j++; + } else { + tmp.push_back(uString[k]); + } + if (j == 3) { + i++; + j = 0; + } + } +} + +void fillTArr(const std::string &tString, float (&t)[3]) { + std::string tmp; + int i = 0; + const int tlen = static_cast(tString.size()); + for (int k=0; k &complexes) { + for (size_t complexIdx = 0; complexIdx < complexes.size(); complexIdx++) { + Complex *complex = &complexes[complexIdx]; + std::vector &chainKeys = complex->chainKeys; + if (chainKeys.empty()) { + continue; + } + unsigned int cmpllen = 0; + for (auto chainKey: chainKeys) { + size_t id = Dbr->sequenceReader->getId(chainKey); + if (id == NOT_AVAILABLE_CHAIN_KEY) { + break; + } + unsigned int reslen = Dbr->sequenceReader->getSeqLen(id); + complex->chainLengths.push_back(reslen); + cmpllen += Dbr->sequenceReader->getSeqLen(id); + } + complex->complexLength = cmpllen; + } +} + +static void getlookupInfo( + const std::string &file, + std::map &chainKeyToComplexIdLookup, + std::vector &complexes, + std::map &complexIdtoIdx +) { + if (file.length() == 0) { + return; + } + MemoryMapped lookupDB(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); + char *data = (char *) lookupDB.getData(); + char *end = data + lookupDB.mappedSize(); + const char *entry[255]; + + int prevComplexId = -1; + int nComplex = 0; + while (data < end && *data != '\0') { + const size_t columns = Util::getWordsOfLine(data, entry, 255); + if (columns < 3) { + Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n"; + continue; + } + auto chainKey = Util::fast_atoi(entry[0]); + auto complexId = Util::fast_atoi(entry[2]); + chainKeyToComplexIdLookup.emplace(chainKey, complexId); + + std::string chainName(entry[1], (entry[2] - entry[1]) - 1); + size_t lastUnderscoreIndex = chainName.find_last_of('_'); + std::string complexName = chainName.substr(0, lastUnderscoreIndex); + + if (complexId != prevComplexId) { + + Complex complex; + complex.complexId = complexId; + complex.complexName = complexName; + complexIdtoIdx.emplace(complexId, nComplex); + complexes.emplace_back(complex); + + prevComplexId = complexId; + nComplex++; + } + complexes.back().chainKeys.emplace_back(chainKey); + complexes.back().nChain++; + data = Util::skipLine(data); + } + lookupDB.close(); +} + +int filtermultimer(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; + const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + int dbaccessMode = (DBReader::USE_INDEX); + + IndexReader* qDbr = NULL; + qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + DBReader qStructDbr((par.db1 + "_ca").c_str(), (par.db1 + "_ca.index").c_str(), + par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + qStructDbr.open(DBReader::NOSORT); + + IndexReader* tDbr = NULL; + DBReader *tStructDbr = NULL; + if (sameDB) { + tDbr = qDbr; + tStructDbr = &qStructDbr; + } + else{ + tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + tStructDbr = new DBReader((par.db2 + "_ca").c_str(), (par.db2 + "_ca.index").c_str(), + par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + tStructDbr->open(DBReader::NOSORT); + } + DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX| DBReader::USE_DATA); + alnDbr.open(DBReader::LINEAR_ACCCESS); + size_t localThreads = 1; +#ifdef OPENMP +localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); +#endif + const bool shouldCompress = (par.compressed == true); + const int db4Type = Parameters::DBTYPE_CLUSTER_RES; + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, shouldCompress, db4Type); + resultWriter.open(); + const int db5Type = Parameters::DBTYPE_GENERIC_DB; + DBWriter resultWrite5((par.db4 + "_info").c_str(), (par.db4 + "_info.index").c_str(), par.threads, shouldCompress, db5Type); + resultWrite5.open(); + + std::string qLookupFile = par.db1 + ".lookup"; + std::string tLookupFile = par.db2 + ".lookup"; + + std::vector qComplexes, tComplexes; + std::map qComplexIdToIdx, tComplexIdToIdx; + chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; + + getlookupInfo(qLookupFile, qChainKeyToComplexIdMap, qComplexes, qComplexIdToIdx); + getComplexResidueLength(qDbr, qComplexes); + Debug::Progress progress(qComplexes.size()); + + if (sameDB) { + tChainKeyToComplexIdMap = qChainKeyToComplexIdMap; + tComplexes = qComplexes; + tComplexIdToIdx = qComplexIdToIdx; + } else { + getlookupInfo(tLookupFile, tChainKeyToComplexIdMap, tComplexes, tComplexIdToIdx); + getComplexResidueLength(tDbr, tComplexes); + } + // std::vector qComplexOrder(qComplexes.size()); + // for (size_t qComplexIdx = 0; qComplexIdx < qComplexes.size(); qComplexIdx++) { + // qComplexOrder[qComplexIdx] = qComplexIdx; + // } + // std::sort(qComplexOrder.begin(), qComplexOrder.end(), [&qComplexes](unsigned int lhs, unsigned int rhs) { + // return qComplexes[lhs].chainKeys.size() > qComplexes[rhs].chainKeys.size(); + // }); + +#pragma omp parallel num_threads(localThreads) + { + char buffer[32]; + char buffer2[4096]; + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); +#endif + resultToWrite_t result; + std::map localComplexMap; + // std::vector< ComplexFilterCriteria> localComplexVector; + std::map> cmplIdToBestAssId; + // std::vector cmpltargetIds; + // std::vector targetIdBestTm; + std::vector selectedAssIDs; + Coordinate16 qcoords; + Coordinate16 tcoords; + + Matcher::result_t res; +#pragma omp for schedule(dynamic, 1) + for (size_t qComplexIdx = 0; qComplexIdx < qComplexes.size(); qComplexIdx++) { + // for (size_t qComplexIdx : qComplexOrder) { + progress.updateProgress(); + Complex qComplex = qComplexes[qComplexIdx]; + unsigned int qComplexId = qComplex.complexId; + std::vector qChainKeys = qComplex.chainKeys; + for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { + unsigned int qChainKey = qChainKeys[qChainIdx]; + unsigned int qChainAlnId = alnDbr.getId(qChainKey); + unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); + // Handling monomer as singleton + if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY) { + break; + } + + char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); + size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); + size_t qChainLen = qDbr->sequenceReader->getSeqLen(qChainDbId); + float* qdata = qcoords.read(qcadata, qChainLen, qCaLength); + + char *data = alnDbr.getData(qChainAlnId, thread_idx); + while (*data != '\0' ) { + ComplexDataHandler retComplex = parseScoreComplexResult(data, res); + + if (!retComplex.isValid) { + Debug(Debug::ERROR) << "No scorecomplex result provided"; + EXIT(EXIT_FAILURE); + } + + data = Util::skipLine(data); + unsigned int assId = retComplex.assId; + unsigned int tChainKey = res.dbKey; + unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); + unsigned int tComplexId = tChainKeyToComplexIdMap.at(tChainKey); + unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); + std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; + //if target is monomer, but user doesn't want, continue + unsigned int tChainAlnId = alnDbr.getId(tChainKey); + if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY) { + continue; + } + float u[3][3]; + float t[3]; + fillUArr(retComplex.uString, u); + fillTArr(retComplex.tString, t); + unsigned int qalnlen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + unsigned int talnlen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + // if (localComplexVector.size() <= assId) { + // ComplexFilterCriteria cmplfiltcrit(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); + // size_t subt = assId - localComplexVector.size(); + // for (size_t sub=0; sub < subt; sub ++) { + // localComplexVector.push_back(cmplfiltcrit); + // } + // localComplexVector.push_back(cmplfiltcrit); + // } + if (localComplexMap.find(assId) == localComplexMap.end()) { + ComplexFilterCriteria cmplfiltcrit(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); + localComplexMap[assId] = cmplfiltcrit; + } + ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + // ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); + cmplfiltcrit.updateAln(qalnlen, talnlen); + + // save Aligned coordinatese if needed : chainTmThr & lddtThr + if (par.filtChainTmThr > 0.0f || par.filtInterfaceLddtThr > 0.0f) { + char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); + size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); + float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); + + unsigned int alnLen = cigarToAlignedLength(res.backtrace); + cmplfiltcrit.fillChainAlignment(qChainKey, tChainKey, alnLen, qdata, tdata, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + // if (par.filtChainTmThr > 0.0f) { + double chainTm = computeChainTmScore(cmplfiltcrit.qAlnChains.back(), cmplfiltcrit.tAlnChains.back(), t, u, res.dbLen); + cmplfiltcrit.updateChainTmScore(chainTm / res.qLen, chainTm / res.dbLen); + // } + } + } // while end + } + + // Filter the target complexes and get the best alignment + // for (unsigned int assId = 0; assId < localComplexVector.size(); assId++) { + for (auto& assId_res : localComplexMap) { + unsigned int tComplexId = assId_res.second.targetComplexId; + // unsigned int tComplexId = localComplexVector.at(assId).targetComplexId; + + unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); + Complex tComplex = tComplexes[tComplexIdx]; + + ComplexFilterCriteria &cmplfiltcrit = assId_res.second; + // ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); + cmplfiltcrit.calcCov(qComplex.complexLength, tComplex.complexLength); + + if (par.filtInterfaceLddtThr > 0.0) { + cmplfiltcrit.computeInterfaceLddt(); + } + + // Check if the criteria are met + if (!(cmplfiltcrit.satisfy(par.covMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))) { + continue; + } + unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); + // Get the best alignement per each target complex + if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { + cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + // cmplIdToBestAssId[tComplexId] = {static_cast(assId), cmplfiltcrit.avgTm}; + } else { + if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { + cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + } + // if (cmplfiltcrit.avgTm > cmplIdToBestAssId.at(tComplexId)[1]) { + // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + // // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res), cmplfiltcrit.avgTm}; + // } + } + + // unsigned int targetindex; + // auto it = std::find(cmpltargetIds.begin(), cmpltargetIds.end(), tComplexId); + // if ( it == cmpltargetIds.end()) { + // cmpltargetIds.push_back(tComplexId); + // selectedAssIDs.push_back(assId); + // targetIdBestTm.push_back(cmplfiltcrit.avgTm); + // } else { + // targetindex = std::distance(cmpltargetIds.begin(), it); + // if (cmplfiltcrit.avgTm > targetIdBestTm[targetindex]) { + // targetIdBestTm[targetindex] = cmplfiltcrit.avgTm; + // selectedAssIDs[targetindex] = assId; + // } + // } + + } + + for (const auto& pair : cmplIdToBestAssId) { + selectedAssIDs.push_back(pair.second[0]); + } + resultWrite5.writeStart(thread_idx); + for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++) { + unsigned int assId = selectedAssIDs.at(assIdidx); + ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + // ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); + unsigned int tComplexId = cmplfiltcrit.targetComplexId; + unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); + Complex tComplex = tComplexes.at(tComplexIdx); + + char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); + result.append(buffer, (outpos - buffer - 1)); + result.push_back('\n'); + + char * tmpBuff = Itoa::u32toa_sse2(tComplexId, buffer2); + tmpBuff = filterToBuffer(cmplfiltcrit, tmpBuff); + resultWrite5.writeAdd(buffer2, tmpBuff - buffer2, thread_idx); + } + if (selectedAssIDs.size() == 0) { + float t[3]; + float u[3][3]; + for (int i=0; i < 3; i++) { + t[i] = 0.0; + } + for (int i=0; i < 3; i++) { + for (int j=0; j < 3; j++) { + u[i][j] = 0.0; + } + } + ComplexFilterCriteria cmplfiltcrit(qComplexId, 1.0, 1.0, t, u); + cmplfiltcrit.qCov = 1.0; + cmplfiltcrit.tCov = 1.0; + cmplfiltcrit.interfaceLddt = 1.0; + resultWrite5.writeStart(thread_idx); + char * tmpBuff = Itoa::u32toa_sse2(qComplexId, buffer2); + tmpBuff = filterToBuffer(cmplfiltcrit, tmpBuff); + resultWrite5.writeAdd(buffer2, tmpBuff - buffer2, thread_idx); + + char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); + result.append(buffer, (outpos - buffer - 1)); + result.push_back('\n'); + } + // if (qComplexId == 1) { + // Debug(Debug::WARNING)<< "hi\n"; + // } + resultWriter.writeData(result.c_str(), result.length(), qComplexId, thread_idx); + resultWrite5.writeEnd(qComplexId, thread_idx); + result.clear(); + localComplexMap.clear(); + cmplIdToBestAssId.clear(); + selectedAssIDs.clear(); + // localComplexVector.clear(); + // cmpltargetIds.clear(); + // targetIdBestTm.clear(); + } // for end + } // MP end + + resultWriter.close(true); + resultWrite5.close(true); + qStructDbr.close(); + alnDbr.close(); + delete qDbr; + if (sameDB == false) { + delete tDbr; + delete tStructDbr; + } + qChainKeyToComplexIdMap.clear(); + tChainKeyToComplexIdMap.clear(); + qComplexes.clear(); + tComplexes.clear(); + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/src/strucclustutils/scoremultimer.cpp b/src/strucclustutils/scoremultimer.cpp index ca7c810..6e8f95a 100644 --- a/src/strucclustutils/scoremultimer.cpp +++ b/src/strucclustutils/scoremultimer.cpp @@ -10,7 +10,7 @@ #include "Coordinate16.h" #include "MultimerUtil.h" #include "set" - +#include "unordered_set" #ifdef OPENMP #include #endif @@ -31,10 +31,13 @@ struct SearchResult { dbResidueLen = residueLen; } - void standardize() { + void standardize(int MonomerIncludeMode) { if (dbResidueLen == 0) alnVec.clear(); + if (MonomerIncludeMode == SKIP_MONOMERS && dbChainKeys.size() < MULTIPLE_CHAINED_COMPLEX) + alnVec.clear(); + if (alnVec.empty()) return; @@ -179,22 +182,27 @@ bool compareNeighborWithDist(const NeighborsWithDist &first, const NeighborsWith class DBSCANCluster { public: - DBSCANCluster(SearchResult &searchResult, std::set &finalClusters, double minCov) : searchResult(searchResult), finalClusters(finalClusters) { + DBSCANCluster(SearchResult &searchResult, std::set &finalClusters, float minCov) : searchResult(searchResult), finalClusters(finalClusters) { cLabel = 0; - clusterSizeThr = std::max(MULTIPLE_CHAINED_COMPLEX, (unsigned int) ((double) searchResult.qChainKeys.size() * minCov)); - idealClusterSize = std::min(searchResult.qChainKeys.size(), searchResult.dbChainKeys.size()); + minimumClusterSize = std::ceil((float) searchResult.qChainKeys.size() * minCov); + maximumClusterSize = std::min(searchResult.qChainKeys.size(), searchResult.dbChainKeys.size()); + maximumClusterNum = searchResult.alnVec.size() / maximumClusterSize; prevMaxClusterSize = 0; - maxDist = 0; - eps = DEFAULT_EPS; + maxDist = FLT_MIN; + minDist = FLT_MAX; learningRate = LEARNING_RATE; } bool getAlnClusters() { + // if Query or Target is a Monomer Complex. + if (std::min(searchResult.qChainKeys.size(), searchResult.dbChainKeys.size()) < MULTIPLE_CHAINED_COMPLEX) + return earlyStopForMonomers(); + // rbh filter filterAlnsByRBH(); - fillDistMap(); + fillDistMatrix(); // To skip DBSCAN clustering when alignments are few enough. - if (searchResult.alnVec.size() <= idealClusterSize) + if (searchResult.alnVec.size() <= maximumClusterSize) return checkClusteringNecessity(); return runDBSCAN(); @@ -204,101 +212,139 @@ class DBSCANCluster { SearchResult &searchResult; float eps; float maxDist; + float minDist; float learningRate; unsigned int cLabel; + unsigned int maximumClusterNum; unsigned int prevMaxClusterSize; - unsigned int maxClusterSize; - unsigned int idealClusterSize; - unsigned int clusterSizeThr; + unsigned int currMaxClusterSize; + unsigned int maximumClusterSize; + unsigned int minimumClusterSize; std::vector neighbors; std::vector neighborsOfCurrNeighbor; + std::unordered_set foundNeighbors; std::vector neighborsWithDist; - std::set qFoundChainKeys; - std::set dbFoundChainKeys; - distMap_t distMap; + std::unordered_set qFoundChainKeys; + std::unordered_set dbFoundChainKeys; + std::vector distMatrix; std::vector currClusters; std::set &finalClusters; std::map qBestTmScore; std::map dbBestTmScore; - bool runDBSCAN() { - initializeAlnLabels(); - if (eps >= maxDist) + bool earlyStopForMonomers() { + if (minimumClusterSize >= MULTIPLE_CHAINED_COMPLEX) return finishDBSCAN(); - for (size_t centerAlnIdx=0; centerAlnIdx < searchResult.alnVec.size(); centerAlnIdx++) { - ChainToChainAln ¢erAln = searchResult.alnVec[centerAlnIdx]; - if (centerAln.label != 0) - continue; + getSingleChainedCluster(); + return finishDBSCAN(); + } - getNeighbors(centerAlnIdx, neighbors); - if (neighbors.size() < MIN_PTS) - continue; + void getSingleChainedCluster() { + finalClusters.clear(); + for (unsigned int alnIdx = 0; alnIdx < searchResult.alnVec.size(); alnIdx++ ) { + neighbors = {alnIdx}; + finalClusters.insert(neighbors); + } + } + + bool runDBSCAN() { + unsigned int neighborIdx; + unsigned int neighborAlnIdx; + while (eps < maxDist) { + initializeAlnLabels(); + for (size_t centerAlnIdx = 0; centerAlnIdx < searchResult.alnVec.size(); centerAlnIdx++) { + ChainToChainAln ¢erAln = searchResult.alnVec[centerAlnIdx]; + if (centerAln.label != 0) + continue; - centerAln.label = ++cLabel; - size_t neighborIdx = 0; - while (neighborIdx < neighbors.size()) { - unsigned int neighborAlnIdx = neighbors[neighborIdx++]; - if (centerAlnIdx == neighborAlnIdx) + getNeighbors(centerAlnIdx, neighbors); + if (neighbors.size() < MIN_PTS) continue; - ChainToChainAln &neighborAln = searchResult.alnVec[neighborAlnIdx]; - neighborAln.label = cLabel; - getNeighbors(neighborAlnIdx, neighborsOfCurrNeighbor); - if (neighborsOfCurrNeighbor.size() < MIN_PTS) + centerAln.label = ++cLabel; + foundNeighbors.clear(); + foundNeighbors.insert(neighbors.begin(), neighbors.end()); + neighborIdx = 0; + while (neighborIdx < neighbors.size()) { + neighborAlnIdx = neighbors[neighborIdx++]; + if (centerAlnIdx == neighborAlnIdx) + continue; + + ChainToChainAln &neighborAln = searchResult.alnVec[neighborAlnIdx]; + neighborAln.label = cLabel; + getNeighbors(neighborAlnIdx, neighborsOfCurrNeighbor); + if (neighborsOfCurrNeighbor.size() < MIN_PTS) + continue; + + for (auto neighbor : neighborsOfCurrNeighbor) { + if (foundNeighbors.insert(neighbor).second) + neighbors.emplace_back(neighbor); + } + } + if (neighbors.size() > maximumClusterSize || checkChainRedundancy()) + getNearestNeighbors(centerAlnIdx); + + // too small cluster + if (neighbors.size() < currMaxClusterSize) continue; - for (auto neighbor : neighborsOfCurrNeighbor) { - if (std::find(neighbors.begin(), neighbors.end(), neighbor) == neighbors.end()) - neighbors.emplace_back(neighbor); + // new Biggest cluster + if (neighbors.size() > currMaxClusterSize) { + currMaxClusterSize = neighbors.size(); + currClusters.clear(); } + SORT_SERIAL(neighbors.begin(), neighbors.end()); + currClusters.emplace_back(neighbors); } - if (neighbors.size() > idealClusterSize || checkChainRedundancy()) - getNearestNeighbors(centerAlnIdx); - // too small cluster - if (neighbors.size() < maxClusterSize) - continue; + if (!finalClusters.empty() && currClusters.empty()) + return finishDBSCAN(); + + if (currMaxClusterSize < prevMaxClusterSize) + return finishDBSCAN(); - // new Biggest cluster - if (neighbors.size() > maxClusterSize) { - maxClusterSize = neighbors.size(); - currClusters.clear(); + if (currMaxClusterSize > prevMaxClusterSize) { + finalClusters.clear(); + prevMaxClusterSize = currMaxClusterSize; } - SORT_SERIAL(neighbors.begin(), neighbors.end()); - currClusters.emplace_back(neighbors); - } - if (!finalClusters.empty() && currClusters.empty()) - return finishDBSCAN(); + if (currMaxClusterSize >= minimumClusterSize) + finalClusters.insert(currClusters.begin(), currClusters.end()); - if (maxClusterSize < prevMaxClusterSize) - return finishDBSCAN(); + if (currMaxClusterSize == maximumClusterSize && finalClusters.size() == maximumClusterNum) + return finishDBSCAN(); - if (maxClusterSize > prevMaxClusterSize) { - finalClusters.clear(); - prevMaxClusterSize = maxClusterSize; + eps += learningRate; } - if (maxClusterSize >= clusterSizeThr) - finalClusters.insert(currClusters.begin(), currClusters.end()); + if (minimumClusterSize < MULTIPLE_CHAINED_COMPLEX && prevMaxClusterSize < MULTIPLE_CHAINED_COMPLEX) + getSingleChainedCluster(); - eps += learningRate; - return runDBSCAN(); + return finishDBSCAN(); + } + + size_t getDistMatrixIndex(size_t i, size_t j) const { + if (i > j) std::swap(i, j); // Ensure i <= j for symmetry + size_t n = searchResult.alnVec.size(); + return (2 * n *i - i - i * i) / 2 + j - i - 1; } - void fillDistMap() { + void fillDistMatrix() { + size_t size = searchResult.alnVec.size(); float dist; - distMap.clear(); - for (size_t i=0; i < searchResult.alnVec.size(); i++) { - ChainToChainAln &prevAln = searchResult.alnVec[i]; - for (size_t j = i+1; j < searchResult.alnVec.size(); j++) { - ChainToChainAln &currAln = searchResult.alnVec[j]; + distMatrix.resize(size * (size - 1) / 2, 0.0f); + for (size_t i = 0; i < searchResult.alnVec.size(); i++) { + const ChainToChainAln &prevAln = searchResult.alnVec[i]; + for (size_t j = i + 1; j < searchResult.alnVec.size(); j++) { + const ChainToChainAln &currAln = searchResult.alnVec[j]; dist = prevAln.getDistance(currAln); maxDist = std::max(maxDist, dist); - distMap.insert({{i,j}, dist}); + minDist = std::min(minDist, dist); + distMatrix[getDistMatrixIndex(i, j)] = dist; } } + eps = minDist; } void getNeighbors(size_t centerIdx, std::vector &neighborVec) { @@ -309,7 +355,7 @@ class DBSCANCluster { if (neighborIdx == centerIdx) continue; - if (distMap[{std::min(centerIdx, neighborIdx), std::max(centerIdx, neighborIdx)}] >= eps) + if (distMatrix[getDistMatrixIndex(centerIdx, neighborIdx)] >= eps) continue; neighborVec.emplace_back(neighborIdx); @@ -321,7 +367,7 @@ class DBSCANCluster { aln.label = INITIALIZED_LABEL; } cLabel = INITIALIZED_LABEL; - maxClusterSize = 0; + currMaxClusterSize = 0; currClusters.clear(); } @@ -341,8 +387,9 @@ class DBSCANCluster { bool checkClusteringNecessity() { // Too few alns => do nothing and finish it - if (searchResult.alnVec.size() < clusterSizeThr) + if (searchResult.alnVec.size() < minimumClusterSize) return finishDBSCAN(); + // All alns as a cluster for (size_t alnIdx=0; alnIdx finish it without clustering - prevMaxClusterSize = neighbors.size(); finalClusters.insert(neighbors); return finishDBSCAN(); } @@ -366,15 +412,7 @@ class DBSCANCluster { dbBestTmScore.clear(); qFoundChainKeys.clear(); dbFoundChainKeys.clear(); - distMap.clear(); -// auto it = finalClusters.begin(); -// while (it != finalClusters.end()) { -// if (it->size() < clusterSizeThr) { -// it = finalClusters.erase(it); -// continue; -// } -// it++; -// } + distMatrix.clear(); return !finalClusters.empty(); } @@ -387,17 +425,17 @@ class DBSCANCluster { qFoundChainKeys.clear(); dbFoundChainKeys.clear(); for (auto qChainKey: searchResult.qChainKeys) { - qBestTmScore.insert({qChainKey, DEF_TM_SCORE}); + qBestTmScore.insert({qChainKey, FLT_MIN}); } for (auto dbChainKey: searchResult.dbChainKeys) { - dbBestTmScore.insert({dbChainKey, DEF_TM_SCORE}); + dbBestTmScore.insert({dbChainKey, FLT_MIN}); } for (auto &aln: searchResult.alnVec) { qKey = aln.qChain.chainKey; dbKey = aln.dbChain.chainKey; tmScore = aln.tmScore; - qBestTmScore[qKey] = qBestTmScore[qKey] < UNINITIALIZED ? tmScore : std::max(tmScore, qBestTmScore[qKey]); - dbBestTmScore[dbKey] = dbBestTmScore[dbKey] < UNINITIALIZED ? tmScore : std::max(tmScore, dbBestTmScore[dbKey]); + qBestTmScore[qKey] = std::max(tmScore, qBestTmScore[qKey]); + dbBestTmScore[dbKey] = std::max(tmScore, dbBestTmScore[dbKey]); } size_t alnIdx = 0; while (alnIdx < searchResult.alnVec.size()) { @@ -413,7 +451,7 @@ class DBSCANCluster { alnIdx ++; } - if (std::min(qFoundChainKeys.size(), dbFoundChainKeys.size()) < clusterSizeThr) + if (std::min(qFoundChainKeys.size(), dbFoundChainKeys.size()) < minimumClusterSize) searchResult.alnVec.clear(); } @@ -425,7 +463,7 @@ class DBSCANCluster { for (auto neighborIdx: neighbors) { if (neighborIdx == centerIdx) continue; - neighborsWithDist.emplace_back(neighborIdx, distMap[{std::min(centerIdx, neighborIdx), std::max(centerIdx, neighborIdx)}]); + neighborsWithDist.emplace_back(neighborIdx, distMatrix[getDistMatrixIndex(centerIdx, neighborIdx)]); } SORT_SERIAL(neighborsWithDist.begin(), neighborsWithDist.end(), compareNeighborWithDist); neighbors.clear(); @@ -441,7 +479,7 @@ class DBSCANCluster { class ComplexScorer { public: - ComplexScorer(IndexReader *qDbr3Di, IndexReader *tDbr3Di, DBReader &alnDbr, IndexReader *qCaDbr, IndexReader *tCaDbr, unsigned int thread_idx, double minAssignedChainsRatio) : alnDbr(alnDbr), qCaDbr(qCaDbr), tCaDbr(tCaDbr), thread_idx(thread_idx), minAssignedChainsRatio(minAssignedChainsRatio) { + ComplexScorer(IndexReader *qDbr3Di, IndexReader *tDbr3Di, DBReader &alnDbr, IndexReader *qCaDbr, IndexReader *tCaDbr, unsigned int thread_idx, float minAssignedChainsRatio, int monomerIncludeMode) : alnDbr(alnDbr), qCaDbr(qCaDbr), tCaDbr(tCaDbr), thread_idx(thread_idx), minAssignedChainsRatio(minAssignedChainsRatio), monomerIncludeMode(monomerIncludeMode) { maxChainLen = std::max(qDbr3Di->sequenceReader->getMaxSeqLen()+1, tDbr3Di->sequenceReader->getMaxSeqLen()+1); q3diDbr = qDbr3Di; t3diDbr = tDbr3Di; @@ -507,8 +545,8 @@ class ComplexScorer { paredSearchResult.alnVec.emplace_back(aln); continue; } - paredSearchResult.standardize(); - if (!paredSearchResult.alnVec.empty() && currDbChainKeys.size() >= MULTIPLE_CHAINED_COMPLEX) + paredSearchResult.standardize(monomerIncludeMode); + if (!paredSearchResult.alnVec.empty()) searchResults.emplace_back(paredSearchResult); paredSearchResult.alnVec.clear(); @@ -519,8 +557,8 @@ class ComplexScorer { paredSearchResult.alnVec.emplace_back(aln); } currAlns.clear(); - paredSearchResult.standardize(); - if (!paredSearchResult.alnVec.empty() && currDbChainKeys.size() >= MULTIPLE_CHAINED_COMPLEX) + paredSearchResult.standardize(monomerIncludeMode); + if (!paredSearchResult.alnVec.empty()) searchResults.emplace_back(paredSearchResult); paredSearchResult.alnVec.clear(); @@ -533,7 +571,7 @@ class ComplexScorer { tmAligner = new TMaligner(maxResLen, false, true, false); } finalClusters.clear(); - DBSCANCluster dbscanCluster = DBSCANCluster(searchResult, finalClusters, minAssignedChainsRatio); + DBSCANCluster dbscanCluster(searchResult, finalClusters, minAssignedChainsRatio); if (!dbscanCluster.getAlnClusters()) { finalClusters.clear(); return; @@ -569,7 +607,7 @@ class ComplexScorer { Coordinate16 qCoords; Coordinate16 tCoords; unsigned int thread_idx; - double minAssignedChainsRatio; + float minAssignedChainsRatio; unsigned int maxResLen; Chain qChain; Chain dbChain; @@ -579,6 +617,7 @@ class ComplexScorer { SearchResult paredSearchResult; std::set finalClusters; bool hasBacktrace; + int monomerIncludeMode; unsigned int getQueryResidueLength(std::vector &qChainKeys) { unsigned int qResidueLen = 0; @@ -671,7 +710,8 @@ int scoremultimer(int argc, const char **argv, const Command &command) { ); } - double minAssignedChainsRatio = par.minAssignedChainsThreshold > MAX_ASSIGNED_CHAIN_RATIO ? MAX_ASSIGNED_CHAIN_RATIO: par.minAssignedChainsThreshold; + float minAssignedChainsRatio = par.minAssignedChainsThreshold > MAX_ASSIGNED_CHAIN_RATIO ? MAX_ASSIGNED_CHAIN_RATIO: par.minAssignedChainsThreshold; + int monomerIncludeMode = par.monomerIncludeMode; std::vector qComplexIndices; std::vector dbComplexIndices; @@ -697,13 +737,13 @@ int scoremultimer(int argc, const char **argv, const Command &command) { std::vector searchResults; std::vector assignments; std::vector resultToWriteLines; - ComplexScorer complexScorer(q3DiDbr, &t3DiDbr, alnDbr, qCaDbr, &tCaDbr, thread_idx, minAssignedChainsRatio); + ComplexScorer complexScorer(q3DiDbr, &t3DiDbr, alnDbr, qCaDbr, &tCaDbr, thread_idx, minAssignedChainsRatio, monomerIncludeMode); #pragma omp for schedule(dynamic, 1) // for each q complex for (size_t qCompIdx = 0; qCompIdx < qComplexIndices.size(); qCompIdx++) { unsigned int qComplexId = qComplexIndices[qCompIdx]; std::vector &qChainKeys = qComplexIdToChainKeysMap.at(qComplexId); - if (qChainKeys.size() < MULTIPLE_CHAINED_COMPLEX) + if (monomerIncludeMode == SKIP_MONOMERS && qChainKeys.size() < MULTIPLE_CHAINED_COMPLEX) continue; complexScorer.getSearchResults(qComplexId, qChainKeys, dbChainKeyToComplexIdMap, dbComplexIdToChainKeysMap, searchResults); // for each db complex diff --git a/src/strucclustutils/structcreatedb.cpp b/src/strucclustutils/structcreatedb.cpp index cf22421..43d5828 100644 --- a/src/strucclustutils/structcreatedb.cpp +++ b/src/strucclustutils/structcreatedb.cpp @@ -15,6 +15,8 @@ #include "PatternCompiler.h" #include "Coordinate16.h" #include "itoa.h" +#include "MathUtil.h" + #ifdef HAVE_PROSTT5 #include "prostt5.h" #endif @@ -79,39 +81,24 @@ static inline bool compareByFirst(const std::pair& a, const std::pair & alphabet3di, std::vector & alphabetAA, - std::vector & camol, std::string & header, - DBWriter & aadbw, DBWriter & hdbw, DBWriter & torsiondbw, DBWriter & cadbw, int chainNameMode, - float maskBfactorThreshold, size_t & tooShort, size_t & notProtein, size_t & globalCnt, int thread_idx, int coordStoreMode, - std::string & filename, size_t & fileidCnt, - std::map> & entrynameToFileId, - std::map & filenameToFileId, - std::map & fileIdToName, - DBWriter* mappingWriter) { - size_t id = __sync_fetch_and_add(&globalCnt, readStructure.chain.size()); - size_t entriesAdded = 0; + +void addMissingAtomsInStructure(GemmiWrapper &readStructure, PulchraWrapper &pulchra) { + std::vector chainNames; for (size_t ch = 0; ch < readStructure.chain.size(); ch++) { - size_t dbKey = id + ch; size_t chainStart = readStructure.chain[ch].first; size_t chainEnd = readStructure.chain[ch].second; size_t chainLen = chainEnd - chainStart; - if (chainLen <= 3) { - tooShort++; - continue; - } - bool allX = true; for (size_t pos = 0; pos < chainLen; pos++) { const char aa = readStructure.ami[chainStart+pos]; @@ -121,10 +108,10 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru } } if (allX) { - notProtein++; + chainNames.push_back("SKIP"); continue; } - + chainNames.push_back(readStructure.chainNames[ch]); // Detect if structure is Ca only if (std::isnan(readStructure.n[chainStart + 0].x) && std::isnan(readStructure.n[chainStart + 1].x) && @@ -133,73 +120,366 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru std::isnan(readStructure.c[chainStart + 0].x) && std::isnan(readStructure.c[chainStart + 1].x) && std::isnan(readStructure.c[chainStart + 2].x) && - std::isnan(readStructure.c[chainStart + 3].x)) - { + std::isnan(readStructure.c[chainStart + 3].x)) { pulchra.rebuildBackbone(&readStructure.ca[chainStart], &readStructure.n[chainStart], &readStructure.c[chainStart], &readStructure.ami[chainStart], chainLen); + + } + } + readStructure.chainNames.clear(); + readStructure.chainNames.insert(readStructure.chainNames.begin(), chainNames.begin(), chainNames.end()); + chainNames.clear(); +} + +void findInterfaceResidues(GemmiWrapper &readStructure, std::pair res1, std::pair res2, + std::vector & resIdx1, float distanceThreshold) +{ + std::vector coord1, coord2; + size_t sameRes = 0; + size_t chainLen = res1.second - res1.first; + bool noSameRes = true; + const float squareThreshold = distanceThreshold * distanceThreshold; + for (size_t res1Idx = res1.first; res1Idx < res1.second; res1Idx++) { + float x1, y1, z1; + if (readStructure.ami[res1Idx] == 'G') { + x1 = readStructure.ca[res1Idx].x; + y1 = readStructure.ca[res1Idx].y; + z1 = readStructure.ca[res1Idx].z; + } + else { + x1 = readStructure.cb[res1Idx].x; + y1 = readStructure.cb[res1Idx].y; + z1 = readStructure.cb[res1Idx].z; } + for (size_t res2Idx = res2.first; res2Idx < res2.second; res2Idx++) { + float x2, y2, z2; + if (readStructure.ami[res2Idx] == 'G') { + x2 = readStructure.ca[res2Idx].x; + y2 = readStructure.ca[res2Idx].y; + z2 = readStructure.ca[res2Idx].z; + } + else { + x2 = readStructure.cb[res2Idx].x; + y2 = readStructure.cb[res2Idx].y; + z2 = readStructure.cb[res2Idx].z; + } + float distance = MathUtil::squareDist(x1, y1, z1, x2, y2, z2); + if (distance < 0.01) { + noSameRes = false; + sameRes++; + break; + } + if (distance < squareThreshold) { + resIdx1.push_back(res1Idx); + if (noSameRes) { + break; + } + } + } + } + if (sameRes / chainLen > 0.9){ + resIdx1.clear(); + } +} + +void compute3DiInterfaces(GemmiWrapper &readStructure, PulchraWrapper &pulchra, StructureTo3Di &structureTo3Di, SubstitutionMatrix & mat3Di, int chainNameMode, float distanceThreshold) { + size_t prevInterfaceChainLen = 0; + std::vector interfaceSeq3di, interfaceAmi; + std::vector resIdx1, resIdx2; + std::vector notProteinChains; + std::vector interfacetaxIds; + std::vector interfaceModelIndices; + std::vector ca, n, c, cb; + std::vector interfaceCa; + std::vector interfaceNames, interfaceChainNames; + std::vector> interfaceChain; + std::unordered_map modelToInterfaceNum; + addMissingAtomsInStructure(readStructure, pulchra); + for (size_t ch1 = 0; ch1 < readStructure.chain.size(); ch1++) { + if (readStructure.chainNames[ch1] == "SKIP") { + interfaceCa.push_back(Vec3(0,0,0)); + interfaceAmi.push_back('X'); + interfaceSeq3di.push_back('X'); + interfaceChain.push_back(std::make_pair(prevInterfaceChainLen, prevInterfaceChainLen+1)); + interfaceNames.push_back("ALLX"); + interfaceChainNames.push_back(readStructure.chainNames[ch1]); + interfacetaxIds.push_back(readStructure.taxIds[ch1]); + interfaceModelIndices.push_back(readStructure.modelIndices[ch1]); + prevInterfaceChainLen++; + continue; + } + for (size_t ch2 = ch1 + 1; ch2 < readStructure.chain.size(); ch2++) { + if (readStructure.chainNames[ch2] == "SKIP") { + continue; + } + if (readStructure.modelIndices[ch1] == readStructure.modelIndices[ch2]) { + findInterfaceResidues(readStructure, readStructure.chain[ch1], readStructure.chain[ch2], resIdx1, distanceThreshold); + findInterfaceResidues(readStructure, readStructure.chain[ch2], readStructure.chain[ch1], resIdx2, distanceThreshold); + if (resIdx1.size() >= 4 && resIdx2.size() >= 4) { + modelToInterfaceNum[readStructure.modelIndices[ch2]]++; + for (size_t i = 0; i < resIdx1.size(); i++) { + ca.push_back(readStructure.ca[resIdx1[i]]); + n.push_back(readStructure.n[resIdx1[i]]); + c.push_back(readStructure.c[resIdx1[i]]); + cb.push_back(readStructure.cb[resIdx1[i]]); + } + // std::sort(resIdx2.begin(), resIdx2.end()); + for (size_t i = 0; i < resIdx2.size(); i++) { + ca.push_back(readStructure.ca[resIdx2[i]]); + n.push_back(readStructure.n[resIdx2[i]]); + c.push_back(readStructure.c[resIdx2[i]]); + cb.push_back(readStructure.cb[resIdx2[i]]); + } + char *states = structureTo3Di.structure2states(ca.data(), + n.data(), + c.data(), + cb.data(), + resIdx1.size() + resIdx2.size()); + for (size_t i = 0; i < resIdx1.size(); i++) { + interfaceSeq3di.push_back(mat3Di.num2aa[static_cast(states[i])]); + interfaceAmi.push_back(readStructure.ami[resIdx1[i]]); + interfaceCa.push_back(readStructure.ca[resIdx1[i]]); + } + for (size_t i = 0; i < resIdx2.size(); i++) { + interfaceSeq3di.push_back(mat3Di.num2aa[static_cast(states[resIdx1.size()+i])]); + interfaceAmi.push_back(readStructure.ami[resIdx2[i]]); + interfaceCa.push_back(readStructure.ca[resIdx2[i]]); + } + interfaceChain.push_back(std::make_pair(prevInterfaceChainLen, prevInterfaceChainLen + resIdx1.size())); + prevInterfaceChainLen += resIdx1.size(); + interfaceChain.push_back(std::make_pair(prevInterfaceChainLen, prevInterfaceChainLen + resIdx2.size())); + prevInterfaceChainLen += resIdx2.size(); + std::string interfaceName; + if (Util::endsWith(".gz", readStructure.names[ch1] )) { + interfaceName.append(Util::remove_extension(Util::remove_extension(readStructure.names[ch1]))); + } else { + interfaceName.append(Util::remove_extension(readStructure.names[ch1])); + } + if (readStructure.modelCount > 1) { + interfaceName.append("_MODEL_"); + interfaceName.append(SSTR(readStructure.modelIndices[ch1])); + } + interfaceModelIndices.push_back(readStructure.modelIndices[ch1]); + interfaceModelIndices.push_back(readStructure.modelIndices[ch2]); + interfaceName.append("_INT_"); + interfaceName.append(SSTR(modelToInterfaceNum[readStructure.modelIndices[ch2]])); + std::string interfaceNameFirst = interfaceName; + std::string interfaceNameSecond = interfaceName; + if (chainNameMode == LocalParameters::CHAIN_MODE_ADD || + (chainNameMode == LocalParameters::CHAIN_MODE_AUTO && readStructure.names.size() > 1)) { + interfaceNameFirst.push_back('_'); + interfaceNameFirst.append(readStructure.chainNames[ch1]); + interfaceNameSecond.push_back('_'); + interfaceNameSecond.append(readStructure.chainNames[ch2]); + } + if (readStructure.title.size() > 0) { + interfaceNameFirst.push_back(' '); + interfaceNameFirst.append(readStructure.title); + interfaceNameSecond.push_back(' '); + interfaceNameSecond.append(readStructure.title); + } + interfaceNames.push_back(interfaceNameFirst); + interfaceNames.push_back(interfaceNameSecond); + interfaceChainNames.push_back(readStructure.chainNames[ch1]); + interfaceChainNames.push_back(readStructure.chainNames[ch2]); + interfacetaxIds.push_back(readStructure.taxIds[ch1]); + interfacetaxIds.push_back(readStructure.taxIds[ch2]); + } + else { + interfaceCa.push_back(Vec3(0,0,0)); + interfaceAmi.push_back('X'); + interfaceSeq3di.push_back('X'); + interfaceChain.push_back(std::make_pair(prevInterfaceChainLen, prevInterfaceChainLen+1)); + interfaceNames.push_back("SHORT"); + interfaceChainNames.push_back(readStructure.chainNames[ch1]); + interfacetaxIds.push_back(readStructure.taxIds[ch1]); + interfaceModelIndices.push_back(readStructure.modelIndices[ch1]); + prevInterfaceChainLen++; + } + resIdx1.clear(); + resIdx2.clear(); + ca.clear(); + n.clear(); + c.clear(); + cb.clear(); + } + } + } + // copy interface data to readStructure + // this overwrites the original data (clear and push_back) + readStructure.ca.clear(); + readStructure.ca.insert(readStructure.ca.begin(), interfaceCa.begin(), interfaceCa.end()); + readStructure.ami.clear(); + readStructure.ami.insert(readStructure.ami.begin(), interfaceAmi.begin(), interfaceAmi.end()); + readStructure.seq3di.clear(); + readStructure.seq3di.insert(readStructure.seq3di.begin(), interfaceSeq3di.begin(), interfaceSeq3di.end()); + readStructure.chain.clear(); + readStructure.chain.insert(readStructure.chain.begin(), interfaceChain.begin(), interfaceChain.end()); + readStructure.names.clear(); + readStructure.names.insert(readStructure.names.begin(), interfaceNames.begin(), interfaceNames.end()); + readStructure.chainNames.clear(); + readStructure.chainNames.insert(readStructure.chainNames.begin(), interfaceChainNames.begin(), interfaceChainNames.end()); + readStructure.taxIds.clear(); + readStructure.taxIds.insert(readStructure.taxIds.begin(), interfacetaxIds.begin(), interfacetaxIds.end()); + readStructure.modelIndices.clear(); + readStructure.modelIndices.insert(readStructure.modelIndices.begin(), interfaceModelIndices.begin(), interfaceModelIndices.end()); +} + +size_t +writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, StructureTo3Di & structureTo3Di, + PulchraWrapper & pulchra, std::vector & alphabet3di, std::vector & alphabetAA, + std::vector & camol, std::string & header, + DBWriter & aadbw, DBWriter & hdbw, DBWriter & torsiondbw, DBWriter & cadbw, int chainNameMode, + float maskBfactorThreshold, size_t & tooShort, size_t & notProtein, size_t & globalCnt, int thread_idx, int coordStoreMode, + size_t & fileidCnt, std::map> & entrynameToFileId, + std::map & filenameToFileId, + std::map & fileIdToName, + DBWriter* mappingWriter) { + LocalParameters &par = LocalParameters::getLocalInstance(); + if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_INTERFACE) { + compute3DiInterfaces(readStructure, pulchra, structureTo3Di, mat, chainNameMode, par.distanceThreshold); + } + size_t id = __sync_fetch_and_add(&globalCnt, readStructure.chain.size()); + size_t entriesAdded = 0; + for (size_t ch = 0; ch < readStructure.chain.size(); ch++) { + size_t dbKey = id + ch; + size_t chainStart = readStructure.chain[ch].first; + size_t chainEnd = readStructure.chain[ch].second; + size_t chainLen = chainEnd - chainStart; + header.clear(); + if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_CHAIN) { + if (chainLen <= 3) { + tooShort++; + continue; + } + bool allX = true; + for (size_t pos = 0; pos < chainLen; pos++) { + const char aa = readStructure.ami[chainStart+pos]; + if (aa != 'X' && aa != 'x') { + allX = false; + break; + } + } + if (allX) { + notProtein++; + continue; + } + // Detect if structure is Ca only + if (std::isnan(readStructure.n[chainStart + 0].x) && + std::isnan(readStructure.n[chainStart + 1].x) && + std::isnan(readStructure.n[chainStart + 2].x) && + std::isnan(readStructure.n[chainStart + 3].x) && + std::isnan(readStructure.c[chainStart + 0].x) && + std::isnan(readStructure.c[chainStart + 1].x) && + std::isnan(readStructure.c[chainStart + 2].x) && + std::isnan(readStructure.c[chainStart + 3].x)) { + pulchra.rebuildBackbone(&readStructure.ca[chainStart], + &readStructure.n[chainStart], + &readStructure.c[chainStart], + &readStructure.ami[chainStart], + chainLen); - char * states = structureTo3Di.structure2states(&readStructure.ca[chainStart], - &readStructure.n[chainStart], - &readStructure.c[chainStart], - &readStructure.cb[chainStart], - chainLen); - for(size_t pos = 0; pos < chainLen; pos++){ - if(readStructure.ca_bfactor[pos] < maskBfactorThreshold){ - alphabet3di.push_back(tolower(mat.num2aa[static_cast(states[pos])])); - alphabetAA.push_back(tolower(readStructure.ami[chainStart+pos])); - }else{ - alphabet3di.push_back(mat.num2aa[static_cast(states[pos])]); + } + char * states = structureTo3Di.structure2states(&readStructure.ca[chainStart], + &readStructure.n[chainStart], + &readStructure.c[chainStart], + &readStructure.cb[chainStart], + chainLen); + for (size_t pos = 0; pos < chainLen; pos++) { + if (readStructure.ca_bfactor[pos] < maskBfactorThreshold) { + alphabet3di.push_back(tolower(mat.num2aa[static_cast(states[pos])])); + alphabetAA.push_back(tolower(readStructure.ami[chainStart+pos])); + } else { + alphabet3di.push_back(mat.num2aa[static_cast(states[pos])]); + alphabetAA.push_back(readStructure.ami[chainStart+pos]); + } + } + if (Util::endsWith(".gz", readStructure.names[ch] )) { + header.append(Util::remove_extension(Util::remove_extension(readStructure.names[ch]))); + } + else { + header.append(Util::remove_extension(readStructure.names[ch])); + } + if (readStructure.modelCount > 1) { + header.append("_MODEL_"); + header.append(SSTR(readStructure.modelIndices[ch])); + } + if (chainNameMode == LocalParameters::CHAIN_MODE_ADD || + (chainNameMode == LocalParameters::CHAIN_MODE_AUTO && readStructure.names.size() > 1)) { + header.push_back('_'); + header.append(readStructure.chainNames[ch]); + } + if (readStructure.title.size() > 0) { + header.push_back(' '); + header.append(readStructure.title); + } + } + else if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_INTERFACE) { + if (readStructure.names[ch] == "ALLX") { + notProtein++; + continue; + } + if (readStructure.names[ch] == "SHORT") { + tooShort++; + continue; + } + for (size_t pos = 0; pos < chainLen; pos++) { + alphabet3di.push_back(readStructure.seq3di[chainStart+pos]); alphabetAA.push_back(readStructure.ami[chainStart+pos]); } + header.append(readStructure.names[ch]); } alphabet3di.push_back('\n'); alphabetAA.push_back('\n'); torsiondbw.writeData(alphabet3di.data(), alphabet3di.size(), dbKey, thread_idx); aadbw.writeData(alphabetAA.data(), alphabetAA.size(), dbKey, thread_idx); - header.clear(); - if (Util::endsWith(".gz", readStructure.names[ch])){ - header.append(Util::remove_extension(Util::remove_extension(readStructure.names[ch]))); - } - else{ - header.append(Util::remove_extension(readStructure.names[ch])); - } - if(readStructure.modelCount > 1){ - header.append("_MODEL_"); - header.append(std::to_string(readStructure.modelIndices[ch])); - } - if(chainNameMode == LocalParameters::CHAIN_MODE_ADD || - (chainNameMode == LocalParameters::CHAIN_MODE_AUTO && readStructure.names.size() > 1)){ - header.push_back('_'); - header.append(readStructure.chainNames[ch]); - } - if(readStructure.title.size() > 0){ - header.push_back(' '); - header.append(readStructure.title); - } header.push_back('\n'); std::string entryName = Util::parseFastaHeader(header.c_str()); #pragma omp critical { - std::string filenameWithExtension = filename; - if (Util::endsWith(".gz", filename)){ - filenameWithExtension = Util::remove_extension(filename); - } - std::string filenameWithoutExtension = Util::remove_extension(filenameWithExtension); - std::map::iterator it = filenameToFileId.find(filenameWithoutExtension); - size_t fileid; - if (it != filenameToFileId.end()) { - fileid = it->second; - } else { - fileid = fileidCnt; - filenameToFileId[filenameWithoutExtension] = fileid; - fileIdToName[fileid] = filenameWithoutExtension; - fileidCnt++; + if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_CHAIN) { + std::string filenameWithExtension; + if (Util::endsWith(".gz", readStructure.names[ch] )) { + filenameWithExtension = Util::remove_extension(Util::remove_extension(readStructure.names[ch])); + } + else { + filenameWithExtension = Util::remove_extension(readStructure.names[ch]); + } + std::string filenameWithoutExtension = Util::remove_extension(filenameWithExtension); + std::map::iterator it = filenameToFileId.find(filenameWithoutExtension); + size_t fileid; + if (it != filenameToFileId.end()) { + fileid = it->second; + } else { + fileid = fileidCnt; + filenameToFileId[filenameWithoutExtension] = fileid; + fileIdToName[fileid] = filenameWithoutExtension; + fileidCnt++; + } + entrynameToFileId[entryName] = std::make_pair(fileid, readStructure.modelIndices[ch]); + } else if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_INTERFACE) { + std::string filenameWithoutExtension; + if (chainNameMode == LocalParameters::CHAIN_MODE_ADD || chainNameMode == LocalParameters::CHAIN_MODE_AUTO) { + size_t firstUnderscore = readStructure.names[ch].find('_'); + filenameWithoutExtension = readStructure.names[ch].substr(0, firstUnderscore); + } else { + filenameWithoutExtension = readStructure.names[ch]; + } + std::map::iterator it = filenameToFileId.find(filenameWithoutExtension); + size_t fileid; + if (it != filenameToFileId.end()) { + fileid = it->second; + } else { + fileid = fileidCnt; + filenameToFileId[filenameWithoutExtension] = fileid; + fileIdToName[fileid] = filenameWithoutExtension; + fileidCnt++; + } + entrynameToFileId[entryName] = std::make_pair(fileid, readStructure.modelIndices[ch]); } - entrynameToFileId[entryName] = std::make_pair(fileid, readStructure.modelIndices[ch]); } hdbw.writeData(header.c_str(), header.size(), dbKey, thread_idx); @@ -487,7 +767,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { int inputFormat = par.inputFormat; // Process tar files! - for(size_t i = 0; i < tarFiles.size(); i++) { + for (size_t i = 0; i < tarFiles.size(); i++) { mtar_t tar; if (Util::endsWith(".tar.gz", tarFiles[i]) || Util::endsWith(".tgz", tarFiles[i])) { #ifdef HAVE_ZLIB @@ -508,7 +788,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { progress.updateProgress(); #ifdef OPENMP int localThreads = par.threads; - if(localThreads > 1){ + if (localThreads > 1) { needsReorderingAtTheEnd = true; } #endif @@ -649,7 +929,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { mat, readStructure, structureTo3Di, pulchra, alphabet3di, alphabetAA, camol, header, aadbw, hdbw, torsiondbw, cadbw, par.chainNameMode, par.maskBfactorThreshold, tooShort, notProtein, globalCnt, thread_idx, par.coordStoreMode, - name, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter ); } @@ -682,7 +962,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < looseFiles.size(); i++) { progress.updateProgress(); - if(readStructure.load(looseFiles[i], (GemmiWrapper::Format)inputFormat) == false){ + if (readStructure.load(looseFiles[i], (GemmiWrapper::Format)inputFormat) == false) { incorrectFiles++; continue; } @@ -692,7 +972,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { mat, readStructure, structureTo3Di, pulchra, alphabet3di, alphabetAA, camol, header, aadbw, hdbw, torsiondbw, cadbw, par.chainNameMode, par.maskBfactorThreshold, tooShort, notProtein, globalCnt, thread_idx, par.coordStoreMode, - looseFiles[i], globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter ); } @@ -755,7 +1035,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { mat, readStructure, structureTo3Di, pulchra, alphabet3di, alphabetAA, camol, header, aadbw, hdbw, torsiondbw, cadbw, par.chainNameMode, par.maskBfactorThreshold, tooShort, notProtein, globalCnt, thread_idx, par.coordStoreMode, - obj_name, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter ); } @@ -806,7 +1086,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { mat, readStructure, structureTo3Di, pulchra, alphabet3di, alphabetAA, camol, header, aadbw, hdbw, torsiondbw, cadbw, par.chainNameMode, par.maskBfactorThreshold, tooShort, notProtein, globalCnt, thread_idx, par.coordStoreMode, - dbname, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter ); } @@ -972,16 +1252,15 @@ int structcreatedb(int argc, const char **argv, const Command& command) { entry.entryName = removeModel(entryNameWithModel); std::pair fileIdModelEntry = entrynameToFileId[entryNameWithModel]; size_t fileId = fileIdModelEntry.first; - if(modelFileIdLookup.find(fileIdModelEntry) == modelFileIdLookup.end()){ + if (modelFileIdLookup.find(fileIdModelEntry) == modelFileIdLookup.end()) { modelFileIdLookup[fileIdModelEntry] = globalFileNumber; entry.fileNumber = globalFileNumber; globalFileNumber++; needToWriteSource = true; - }else{ + } else { entry.fileNumber = modelFileIdLookup[fileIdModelEntry]; needToWriteSource = false; } - maxFileId = std::max(fileId, maxFileId); readerHeader.lookupEntryToBuffer(buffer, entry); size_t written = fwrite(buffer.c_str(), sizeof(char), buffer.size(), file); @@ -990,9 +1269,9 @@ int structcreatedb(int argc, const char **argv, const Command& command) { EXIT(EXIT_FAILURE); } buffer.clear(); - if(needToWriteSource) { + if (needToWriteSource) { std::string filename = FileUtil::baseName(fileIdToName[fileId]); - if(needToWriteModel) { + if (needToWriteModel) { filename.append("_MODEL_"); filename.append(SSTR(fileIdModelEntry.second)); } diff --git a/src/strucclustutils/structurealign.cpp b/src/strucclustutils/structurealign.cpp index d52bf20..2fb525c 100644 --- a/src/strucclustutils/structurealign.cpp +++ b/src/strucclustutils/structurealign.cpp @@ -388,7 +388,8 @@ int structurealign(int argc, const char **argv, const Command& command) { res.qStartPos, res.dbStartPos, res.backtrace, - std::min(static_cast(res.backtrace.size()), std::min(res.dbLen, res.qLen))); + TMaligner::normalization(par.tmScoreThrMode, std::min(res.qEndPos - res.qStartPos, res.dbEndPos - res.dbStartPos ), res.qLen, res.dbLen)); + if (tmres.tmscore < par.tmScoreThr) { continue; } diff --git a/src/strucclustutils/structureconvertalis.cpp b/src/strucclustutils/structureconvertalis.cpp index 6c0e8da..8b511d5 100644 --- a/src/strucclustutils/structureconvertalis.cpp +++ b/src/strucclustutils/structureconvertalis.cpp @@ -522,7 +522,6 @@ R"html( Coordinate16 tcoords; std::string tmpBt; - double rmsd = 0.0; #pragma omp for schedule(dynamic, 10) for (size_t i = 0; i < alnDbr.getSize(); i++) { progress.updateProgress(); @@ -654,11 +653,6 @@ R"html( if(needTMaligner){ tmaligner->initQuery(queryCaData, &queryCaData[res.qLen], &queryCaData[res.qLen+res.qLen], NULL, res.qLen); tmpBt = Matcher::uncompressAlignment(res.backtrace); - tmres = tmaligner->computeTMscore(targetCaData, &targetCaData[res.dbLen], &targetCaData[res.dbLen+res.dbLen], res.dbLen, - res.qStartPos, res.dbStartPos, tmpBt, - std::min(std::min(res.dbLen, res.qLen), static_cast(tmpBt.size()))); - rmsd = tmres.rmsd; - } LDDTCalculator::LDDTScoreResult lddtres; if(needLDDT) { @@ -885,22 +879,25 @@ R"html( result.append(SSTR(tmres.t[2])); break; case LocalParameters::OUTFMT_ALNTMSCORE: + tmres = tmaligner->computeTMscore(targetCaData, &targetCaData[res.dbLen], &targetCaData[res.dbLen+res.dbLen], res.dbLen, + res.qStartPos, res.dbStartPos, tmpBt, + std::min(res.qEndPos - res.qStartPos, res.dbEndPos - res.dbStartPos)); result.append(SSTR(tmres.tmscore)); break; case LocalParameters::OUTFMT_QTMSCORE: tmres = tmaligner->computeTMscore(targetCaData, &targetCaData[res.dbLen], &targetCaData[res.dbLen+res.dbLen], res.dbLen, - res.qStartPos, res.dbStartPos, Matcher::uncompressAlignment(res.backtrace), - res.qLen); + res.qStartPos, res.dbStartPos, tmpBt,res.qLen); result.append(SSTR(tmres.tmscore)); break; case LocalParameters::OUTFMT_TTMSCORE: tmres = tmaligner->computeTMscore(targetCaData, &targetCaData[res.dbLen], &targetCaData[res.dbLen+res.dbLen], res.dbLen, - res.qStartPos, res.dbStartPos, Matcher::uncompressAlignment(res.backtrace), - res.dbLen); + res.qStartPos, res.dbStartPos, tmpBt, res.dbLen); result.append(SSTR(tmres.tmscore)); break; case LocalParameters::OUTFMT_RMSD: - result.append(SSTR(rmsd)); + tmres = tmaligner->computeTMscore(targetCaData, &targetCaData[res.dbLen], &targetCaData[res.dbLen+res.dbLen], res.dbLen, + res.qStartPos, res.dbStartPos, tmpBt, res.dbLen); + result.append(SSTR(tmres.rmsd)); break; case LocalParameters::OUTFMT_LDDT: // TODO: make SSTR_approx that outputs %2f, not %3f diff --git a/src/strucclustutils/structurerescorediagonal.cpp b/src/strucclustutils/structurerescorediagonal.cpp index 159f50a..b7aa43d 100644 --- a/src/strucclustutils/structurerescorediagonal.cpp +++ b/src/strucclustutils/structurerescorediagonal.cpp @@ -344,7 +344,7 @@ int structureungappedalign(int argc, const char **argv, const Command& command) float* targetCaData = tcoords.read(tcadata, res.dbLen, tCaLength); TMaligner::TMscoreResult tmres = tmaligner->computeTMscore(targetCaData, &targetCaData[res.dbLen], &targetCaData[res.dbLen+res.dbLen], res.dbLen, res.qStartPos, res.dbStartPos, Matcher::uncompressAlignment(res.backtrace), - std::min(static_cast(res.backtrace.length()), std::min(res.dbLen, res.qLen))); + TMaligner::normalization(par.tmScoreThrMode, std::min(res.qEndPos - res.qStartPos, res.dbEndPos - res.dbStartPos ), res.qLen, res.dbLen)); if(tmres.tmscore < par.tmScoreThr){ continue; } diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index f2bad89..88cc3da 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -8,5 +8,7 @@ set(workflow_source_files workflow/EasyStructureCluster.cpp workflow/EasyMultimerSearch.cpp workflow/MultimerSearch.cpp + workflow/MultimerCluster.cpp + workflow/EasyMultimerCluster.cpp PARENT_SCOPE ) diff --git a/src/workflow/EasyMultimerCluster.cpp b/src/workflow/EasyMultimerCluster.cpp new file mode 100644 index 0000000..129a9b0 --- /dev/null +++ b/src/workflow/EasyMultimerCluster.cpp @@ -0,0 +1,77 @@ +#include + +#include "LocalParameters.h" +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" + +#include "easymultimercluster.sh.h" + +void setEasyMultimerClusterDefaults(Parameters *p) { + //TODO + p->removeTmpFiles = true; + p->writeLookup = true; +} + +void setEasyMultimerClusterMustPassAlong(Parameters *p) { + //TODO + p->PARAM_REMOVE_TMP_FILES.wasSet = true; + p->PARAM_WRITE_LOOKUP.wasSet = true; +} + +int easymultimercluster(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + //TODO + par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); //prefilter + par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); + + for (size_t i = 0; i < par.createdb.size(); i++){ + par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } + par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + + setEasyMultimerClusterDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setEasyMultimerClusterMustPassAlong(&par); + + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + CommandCaller cmd; + cmd.addVariable("TMP_PATH", tmpDir.c_str()); + cmd.addVariable("RESULT", par.filenames.back().c_str()); + par.filenames.pop_back(); + cmd.addVariable("INPUT", par.filenames.back().c_str()); + par.filenames.pop_back(); + + cmd.addVariable("RUNNER", par.runner.c_str()); + cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); + cmd.addVariable("MULTIMERCLUSTER_PAR", par.createParameterString(par.multimerclusterworkflow,true).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); + cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str()); + cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); + cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); + cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); + + std::string program = tmpDir + "/easymultimercluster.sh"; + FileUtil::writeFile(program, easymultimercluster_sh, easymultimercluster_sh_len); + + cmd.execProgram(program.c_str(), par.filenames); + + // Should never get here + assert(false); + return EXIT_FAILURE; +} diff --git a/src/workflow/EasyStructureSearch.cpp b/src/workflow/EasyStructureSearch.cpp index c59cc57..49b69b1 100644 --- a/src/workflow/EasyStructureSearch.cpp +++ b/src/workflow/EasyStructureSearch.cpp @@ -20,6 +20,7 @@ void setEasyStructureSearchDefaults(Parameters *p) { p->gapExtend = 1; p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; p->removeTmpFiles = true; + p->reportMode = 2; } void setEasyStructureSearchMustPassAlong(Parameters *p) { p->PARAM_K.wasSet = true; @@ -50,6 +51,8 @@ int easystructuresearch(int argc, const char **argv, const Command &command) { par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.overrideParameterDescription(par.PARAM_REPORT_MODE, "Taxonomy report mode 0: Kraken 1: Krona 2: Skip taxonomy report", "^[0-2]{1}$", 0); + setEasyStructureSearchDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); setEasyStructureSearchMustPassAlong(&par); @@ -69,6 +72,10 @@ int easystructuresearch(int argc, const char **argv, const Command &command) { bool needLDDT = false; LocalParameters::getOutputFormat(par.formatAlignmentMode, par.outfmt, needSequenceDB, needBacktrace, needFullHeaders, needLookup, needSource, needTaxonomyMapping, needTaxonomy, needQCA, needTCA, needTMalign, needLDDT); + if (par.reportMode != 2) { + needTaxonomy = true; + needTaxonomyMapping = true; + } } if (par.formatAlignmentMode == Parameters::FORMAT_ALIGNMENT_SAM || @@ -129,6 +136,9 @@ int easystructuresearch(int argc, const char **argv, const Command &command) { cmd.addVariable("CONVERT_PAR", par.createParameterString(par.convertalignments).c_str()); cmd.addVariable("SUMMARIZE_PAR", par.createParameterString(par.summarizeresult).c_str()); + cmd.addVariable("TAXONOMY", needTaxonomy && needTaxonomyMapping && par.reportMode != 2 ? "TRUE" : NULL); + cmd.addVariable("TAXONOMYREPORT_PAR", par.createParameterString(par.taxonomyreport).c_str()); + std::string program = tmpDir + "/easystructuresearch.sh"; FileUtil::writeFile(program, easystructuresearch_sh, easystructuresearch_sh_len); cmd.execProgram(program.c_str(), par.filenames); diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp new file mode 100644 index 0000000..688aac4 --- /dev/null +++ b/src/workflow/MultimerCluster.cpp @@ -0,0 +1,63 @@ +#include + +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "LocalParameters.h" +#include "Debug.h" +#include "multimercluster.sh.h" + +void setMultimerClusterDefaults(LocalParameters *p) { + p->filtMultimerTmThr = 0.65; // FIX + p->filtChainTmThr = 0.001; // FIX + p->filtInterfaceLddtThr = 0.5; // FIX +} + +int multimercluster(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); //prefilter + par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + for (size_t i = 0; i < par.createdb.size(); i++){ + par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } + par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + + setMultimerClusterDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + CommandCaller cmd; + std::cout<