From 966b93b2433fe4ef708c5d0805f496b1cb4feca4 Mon Sep 17 00:00:00 2001 From: Jacob Morrison Date: Mon, 6 Nov 2023 14:56:32 -0500 Subject: [PATCH] make bc compatible with umi-tools format --- src/bc.c | 26 ++++++++++++++++---------- src/bc.h | 11 ++++++++++- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/bc.c b/src/bc.c index a268605..04d0eff 100644 --- a/src/bc.c +++ b/src/bc.c @@ -65,14 +65,16 @@ int prepare_read_se(kseq_t *k, kstring_t *s, bc_conf_t *conf) { } } + // Remove "/1" or "/2" from the end of the read name + remove_read_number(k); + // Create read entry // NOTE: This assumes there was a FASTQ comment, it may be worth it to look into how // to handle cases where there aren't comments to start with ksprintf( s, - "@%s %s:%.*s\n%.*s%s\n+\n%.*s%s\n", - k->name.s, k->comment.s, - conf->bc_length, k->seq.s+conf->bc_start, + "@%s_%.*s_AAAAAAAA %s\n%.*s%s\n+\n%.*s%s\n", + k->name.s, conf->bc_length, k->seq.s+conf->bc_start, k->comment.s, conf->bc_start, k->seq.s, k->seq.s+conf->bc_start+conf->bc_length, conf->bc_start, k->qual.s, k->qual.s+conf->bc_start+conf->bc_length ); @@ -112,14 +114,17 @@ int prepare_read_pe(kseq_t *k1, kseq_t *k2, kstring_t *s1, kstring_t *s2, bc_con } } + // Remove "/1" or "/2" from the end of the read name + remove_read_number(k_has_bc); + remove_read_number(k_not_bc); + // Create entry for read with barcode // NOTE: This assumes there was a FASTQ comment, it may be worth it to look into how // to handle cases where there aren't comments to start with ksprintf( s_has_bc, - "@%s %s:%.*s\n%.*s%s\n+\n%.*s%s\n", - k_has_bc->name.s, k_has_bc->comment.s, - conf->bc_length, k_has_bc->seq.s+conf->bc_start, + "@%s_%.*s_AAAAAAAA %s\n%.*s%s\n+\n%.*s%s\n", + k_has_bc->name.s, conf->bc_length, k_has_bc->seq.s+conf->bc_start, k_has_bc->comment.s, conf->bc_start, k_has_bc->seq.s, k_has_bc->seq.s+conf->bc_start+conf->bc_length, conf->bc_start, k_has_bc->qual.s, k_has_bc->qual.s+conf->bc_start+conf->bc_length ); @@ -129,9 +134,8 @@ int prepare_read_pe(kseq_t *k1, kseq_t *k2, kstring_t *s1, kstring_t *s2, bc_con // to handle cases where there aren't comments to start with ksprintf( s_not_bc, - "@%s %s:%.*s\n%s\n+\n%s\n", - k_not_bc->name.s, k_not_bc->comment.s, - conf->bc_length, k_has_bc->seq.s+conf->bc_start, + "@%s_%.*s_AAAAAAAA %s\n%s\n+\n%s\n", + k_not_bc->name.s, conf->bc_length, k_has_bc->seq.s+conf->bc_start, k_not_bc->comment.s, k_not_bc->seq.s, k_not_bc->qual.s ); @@ -231,7 +235,9 @@ static void usage() { fprintf(stderr, "General Options:\n"); fprintf(stderr, " -h, --help This help\n"); fprintf(stderr, "\n"); - fprintf(stderr, "Note: When writing to stdout, reads 1 and 2 will alternate (i.e., are interleaved)\n"); + fprintf(stderr, "Note 1: When writing to stdout, reads 1 and 2 will alternate (i.e., are interleaved)\n"); + fprintf(stderr, "Note 2: Also adds an artificial UMI (AAAAAAAA) for compatibility purposes and to serve\n"); + fprintf(stderr, " as a placeholder for future UMI work\n"); fprintf(stderr, "\n"); } diff --git a/src/bc.h b/src/bc.h index 055934d..416563e 100644 --- a/src/bc.h +++ b/src/bc.h @@ -37,7 +37,7 @@ KSEQ_INIT(gzFile, gzread) // Number of extra bytes to allocate -#define N_EXTRA 32 +#define N_EXTRA 64 // uint8_t should be large enough for now, but if barcodes start to occur // in locations further in from the start of the read or barcode lengths @@ -58,6 +58,15 @@ static inline void bc_conf_init(bc_conf_t *conf) { gzFile setup_output(const char *ofile, uint8_t read_num); +// Remove "/1" and "/2" from read name +static inline void remove_read_number(kseq_t *k) { + size_t len = k->name.l; + if (len > 2 && k->name.s[len-2] == '/' && (k->name.s[len-1] == '1' || k->name.s[len-1] == '2')) { + k->name.s[len-2] = '\0'; + k->name.l = len-2; + } +} + int prepare_read_se(kseq_t *k, kstring_t *s, bc_conf_t *conf); void extract_barcodes(bc_conf_t *conf, kseq_t *ks1, kseq_t *ks2, gzFile oh1, gzFile oh2);