# Patch summary: adds a "-n" option to earlGrey and forwards it to TEstrainer.
#
# earlGrey
#   - usage(): documents "-n" as the max number of sequences used to generate
#     consensus sequences (Default: 20).
#   - main getopts loop: option string gains "n:"; the new case arm stores the
#     argument in $no_seq.
#   - Checks(): when $no_seq is empty it is set to 20; in both branches the
#     chosen value is echoed.
#   - strainer(): the TEstrainer_for_earlGrey.sh call gains "-n $no_seq".
#
# scripts/TEstrainer/TEstrainer_for_earlGrey.sh
#   - new default NO_SEQ=20; getopts string gains "n:"; the new case arm stores
#     the argument in NO_SEQ.
#   - the initial_mafft_setup.py invocation gains "-n ${NO_SEQ}".
#
# NOTE(review): the line breaks of this patch appear to have been collapsed;
#   hunk headers ("@@") and the "+"/"-" markers sit mid-line, and the original
#   indentation / blank context lines are not recoverable from this text. It
#   will presumably not apply as-is -- regenerate it from the source commit
#   rather than re-wrapping it by hand.
# NOTE(review): the "-n" value is passed along unquoted and is not checked to
#   be a positive integer anywhere in the hunks shown -- confirm whether
#   initial_mafft_setup.py validates it.
# NOTE(review): both branches of the new Checks() conditional echo the same
#   message; only the "no_seq=20" assignment differs.
# NOTE(review): in the TEstrainer getopts string, "M" has no trailing colon
#   while its case arm reads ${OPTARG}, and "h" does have one; this is in
#   context lines (not changed by this patch) but looks unintended -- confirm.
diff --git a/earlGrey b/earlGrey index 45d5cd0..ed5684c 100644 --- a/earlGrey +++ b/earlGrey @@ -18,6 +18,7 @@ usage() -c == Cluster TE library to reduce redundancy? (yes/no, Default: no) -m == Remove putative spurious TE annotations <100bp? (yes/no, Default: no) -d == Create soft-masked genome at the end? (yes/no, Default: no) + -n == Max number of sequences used to generate consensus sequences (Default: 20) -h == Show help Example Usage: @@ -159,7 +160,7 @@ deNovo1() strainer() { cd ${OUTDIR}/${species}_strainer/ - ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -g $genome -l ${OUTDIR}/${species}_Database/${species}-families.fa -t ${ProcNum} -f $Flank -r $num + ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -g $genome -l ${OUTDIR}/${species}_Database/${species}-families.fa -t ${ProcNum} -f $Flank -r $num -n $no_seq latestFile="$(realpath $(ls -td -- ${OUTDIR}/${species}_strainer/*/ | head -n 1))/${species}-families.fa.strained" cp $latestFile ${OUTDIR}/${species}_strainer/ } @@ -303,6 +304,12 @@ Checks() else echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations" fi + + if [ -z "$no_seq" ] ; then + no_seq=20; echo "$no_seq sequences will be used in BEAT consensus generation" + else + echo "$no_seq sequences will be used in BEAT consensus generation" + fi if [ -z "$cluster" ] || [ "$cluster" == "no" ] ; then cluster=no; echo "TE library consensus sequences will not be clustered" @@ -373,7 +380,7 @@ Checks() # Main # -while getopts g:s:o:t:f:i:r:c:l:m:d:h option +while getopts g:s:o:t:f:i:r:c:l:m:d:n:h option do case "${option}" in @@ -388,6 +395,7 @@ do c) cluster=${OPTARG};; m) margin=${OPTARG};; d) softMask=${OPTARG};; + n) no_seq=${OPTARG};; h) usage; exit 0;; esac done diff --git a/scripts/TEstrainer/TEstrainer_for_earlGrey.sh b/scripts/TEstrainer/TEstrainer_for_earlGrey.sh index 85547d2..7139e64 100644 --- a/scripts/TEstrainer/TEstrainer_for_earlGrey.sh +++ b/scripts/TEstrainer/TEstrainer_for_earlGrey.sh @@ -7,13 +7,14 @@ 
STRAIN_SCRIPTS=INSERT_FILENAME_HERE FLANK=1000 THREADS=4 RUNS=10 +NO_SEQ=20 # for potential folder name TIME=$(date +"%s") TIME=${TIME: -4} MEM_FREE="200M" # parsing -while getopts l:g:t:f:r:d:h:M flag; do +while getopts l:g:t:f:r:d:h:n:M flag; do case "${flag}" in l) RM_LIBRARY_PATH=${OPTARG};; g) GENOME=${OPTARG};; @@ -21,6 +22,7 @@ while getopts l:g:t:f:r:d:h:M flag; do f) FLANK=${OPTARG};; r) RUNS=${OPTARG};; d) DATA_DIR=${OPTARG};; + n) NO_SEQ=${OPTARG};; M) MEM_FREE=${OPTARG};; h | *) print_usage @@ -93,7 +95,7 @@ do parallel --bar --jobs ${THREADS} --memfree ${MEM_FREE} -a ${DATA_DIR}/run_${RUN_NO}/raw/${RM_LIBRARY}_split.txt trf ${DATA_DIR}/run_${RUN_NO}/raw/{} 2 7 7 80 10 50 500 -d -h -ngs ">" ${DATA_DIR}/run_${RUN_NO}/raw/{}.trf echo "Initial blast and preparation for MSA "${RUN_NO} # initial blast and extention - parallel --bar --jobs ${THREADS} --memfree ${MEM_FREE} -a ${DATA_DIR}/run_${RUN_NO}/raw/${RM_LIBRARY}_split.txt python3 ${STRAIN_SCRIPTS}/initial_mafft_setup.py -d ${DATA_DIR} -r ${RUN_NO} -s {} -g ${GENOME} -f ${FLANK} -D + parallel --bar --jobs ${THREADS} --memfree ${MEM_FREE} -a ${DATA_DIR}/run_${RUN_NO}/raw/${RM_LIBRARY}_split.txt python3 ${STRAIN_SCRIPTS}/initial_mafft_setup.py -d ${DATA_DIR} -r ${RUN_NO} -s {} -g ${GENOME} -f ${FLANK} -D -n ${NO_SEQ} ## first mafft alignment find ${DATA_DIR}/run_${RUN_NO}/to_align -type f | sed 's/.*\///' > ${DATA_DIR}/run_${RUN_NO}/to_align.txt