Skip to content

Commit

Permalink
make align work with umi-tools compatible barcode format
Browse files Browse the repository at this point in the history
  • Loading branch information
jamorrison committed Nov 6, 2023
1 parent 966b93b commit d0b9354
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 8 deletions.
4 changes: 2 additions & 2 deletions lib/aln/align.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ static void *process(void *shared, int step, void *_data) {
} else if (step == 2) {
for (i = 0; i < data->n_seqs; ++i) {
if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout);
free(data->seqs[i].name); free(data->seqs[i].comment); free(data->seqs[i].barcode);
free(data->seqs[i].name); free(data->seqs[i].comment); free(data->seqs[i].barcode); free(data->seqs[i].umi);
free(data->seqs[i].seq0); free(data->seqs[i].qual); free(data->seqs[i].sam);
/* bisulfite free, the pointers can be NULL */
free(data->seqs[i].bisseq[0]);
Expand Down Expand Up @@ -257,7 +257,7 @@ int usage(mem_opt_t *opt) {
fprintf(stderr, " -S Skip mate rescue\n");
fprintf(stderr, " -P Skip pairing - mate rescue performed unless -S also given\n");
fprintf(stderr, " -e Discard full-length exact matches\n");
fprintf(stderr, " -9 Extract barcode from read comment\n");
fprintf(stderr, " -9 Extract barcode and UMI from read name\n");
fprintf(stderr, "\n");
fprintf(stderr, "Scoring options:\n");
fprintf(stderr, " -A INT Score for a sequence match, scales options -TdBOELU unless\n");
Expand Down
32 changes: 29 additions & 3 deletions lib/aln/bwa.c
Original file line number Diff line number Diff line change
Expand Up @@ -767,12 +767,38 @@ static void bis_kseq2bseq1(const kseq_t *ks, bseq1_t *s, uint8_t has_bc)
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
s->name = strdup(ks->name.s);
s->comment = ks->comment.l? strdup(ks->comment.s) : 0;
if (has_bc && ks->comment.l) {
// No checks are performed here, so we have to rely on the user to have done the proper
if (has_bc) {
// Minimal checks are performed here, so we have to rely on the user to have done the proper
// preprocessing before processing with barcodes
s->barcode = strdup(strrchr(ks->comment.s, ':') + 1);
char *temp_string = strdup(ks->name.s);
char *temp_tok = NULL;
char *bc = NULL;
char *umi = NULL;

// Pull out first entry
temp_tok = strtok(temp_string, "_");

if (!temp_tok) {
fprintf(stderr, "[W::%s] barcode and UMI extraction requested but could not include be extracted\n", __func__);
}

// barcode
bc = strtok(NULL, "_");

// umi
umi = strtok(NULL, "_");

while ((temp_tok = strtok(NULL, "_")) != NULL) {
bc = umi;
umi = temp_tok;
}

s->barcode = strdup(bc);
s->umi = strdup(umi);
free(temp_string);
} else {
s->barcode = 0;
s->umi = 0;
}
s->seq = (uint8_t*) strdup(ks->seq.s);
s->seq0 = s->seq;
Expand Down
2 changes: 1 addition & 1 deletion lib/aln/bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ typedef struct {

typedef struct {
int l_seq, id; /* check if l_seq can be unsigned? */
char *name, *comment, *barcode, *qual, *sam; /* sam stored the end output of sam record */
char *name, *comment, *barcode, *umi, *qual, *sam; /* sam stored the end output of sam record */
uint8_t *seq, *bisseq[2];
uint8_t *seq0; /* pointer to sequence beginning before clipping */
int l_seq0; /* the original l_seq before clipping */
Expand Down
2 changes: 1 addition & 1 deletion lib/aln/bwamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ typedef struct {
int clip5; /* extra clip from 5'-end */
int clip3; /* extra clip from 3'-end */
int min_base_qual; /* minimum base quality */
uint8_t has_bc; /* read comment has barcode in it */
uint8_t has_bc; /* read comment has barcode and UMI in it */
} mem_opt_t;

typedef struct {
Expand Down
6 changes: 5 additions & 1 deletion lib/aln/mem_alnreg_format.c
Original file line number Diff line number Diff line change
Expand Up @@ -401,9 +401,13 @@ void mem_alnreg_formatSAM(
}

if (s->barcode) {
kputsn("\tCR:Z:", 6, str);
kputsn("\tCB:Z:", 6, str);
kputs(s->barcode, str);
}
if (s->umi) {
kputsn("\tRX:Z:", 6, str);
kputs(s->umi, str);
}

// MC/MQ: CIGAR string for mate/next segment (MC) and Mapping quality for mate/next segment (MQ)
kputsn("\tMC:Z:", 6, str);
Expand Down

0 comments on commit d0b9354

Please sign in to comment.