Skip to content

Commit

Permalink
Allow paired reads to be read from separate sequence files
Browse files Browse the repository at this point in the history
  • Loading branch information
drpowell committed May 22, 2012
1 parent 06f7ca5 commit ce5d396
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 11 deletions.
75 changes: 65 additions & 10 deletions src/readSet.c
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,48 @@ static void readFastXFile(int fileType, SequencesWriter *seqWriteInfo, char *fil
velvetLog("Done\n");
}

static void readFastXPair(int fileType, SequencesWriter *seqWriteInfo, char *filename1, char *filename2, Category cat, IDnum * sequenceIndex)
{
kseq_t *seq1, *seq2;
gzFile file1, file2;
IDnum counter = 0;

if (cat==REFERENCE)
exitErrorf(EXIT_FAILURE, false, "Cannot read reference sequence in 'separate' read mode");

file1 = openFastXFile(fileType, filename1);
file2 = openFastXFile(fileType, filename2);
initFastX(seqWriteInfo, cat);

// Read a sequence at a time
seq1 = kseq_init(file1);
seq2 = kseq_init(file2);
while (kseq_read(seq1) >= 0) {
counter++;
writeSeqName(seq1->name.s, seqWriteInfo, cat, sequenceIndex);
writeSequence(seq1->seq.s, seqWriteInfo);

if (kseq_read(seq2) < 0)
exitErrorf(EXIT_FAILURE, false, "Right sequence file '%s' has too few sequences", filename2);

counter++;
writeSeqName(seq2->name.s, seqWriteInfo, cat, sequenceIndex);
writeSequence(seq2->seq.s, seqWriteInfo);
}
if (kseq_read(seq2) >= 0)
exitErrorf(EXIT_FAILURE, false, "Right sequence file '%s' has too many sequences", filename2);

kseq_destroy(seq1);
kseq_destroy(seq2);
gzclose(file1);
gzclose(file2);

cleanupFastX(seqWriteInfo, cat);

velvetLog("%li sequences found in total in the paired sequence files\n", (long) counter);
velvetLog("Done\n");
}

static void addMapping(boolean orientation, Coordinate pos, char * seq, ReferenceCoordinate * refCoord, char * buffer, SequencesWriter * seqWriteInfo, RefInfoList ** refTail, size_t * buffer_size) {
if (isCreateBinary()) {
seqWriteInfo->m_bIsRef = true;
Expand Down Expand Up @@ -1120,7 +1162,7 @@ static void readBAMFile(SequencesWriter *seqWriteInfo, char *filename, Category
static void printUsage()
{
puts("Usage:");
puts("./velveth directory hash_length {[-file_format][-read_type] filename} [options]");
puts("./velveth directory hash_length {[-file_format][-read_type][-separate|-interleaved] filename} [options]");
puts("");
puts("\tdirectory\t\t: directory name for output files");
printf("\thash_length\t\t: odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
Expand Down Expand Up @@ -1165,6 +1207,7 @@ void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * dou
short short_var;
ReferenceCoordinateTable * refCoords = newReferenceCoordinateTable();
boolean reuseSequences = false;
boolean separate_pair_files = false;

if (argc < 2) {
printUsage();
Expand Down Expand Up @@ -1265,6 +1308,10 @@ void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * dou
;
} else if (strcmp(argv[argIndex], "-create_binary") == 0) {
;
} else if (strcmp(argv[argIndex], "-interleaved") == 0) {
separate_pair_files = false;
} else if (strcmp(argv[argIndex], "-separate") == 0) {
separate_pair_files = true;
}
else {
velvetLog("Unknown option: %s\n",
Expand All @@ -1283,21 +1330,29 @@ void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * dou

switch (filetype) {
case FASTA:
readFastXFile(FASTA, seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
break;
case FASTQ:
readFastXFile(FASTQ, seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
case FASTA_GZ:
case FASTQ_GZ:
// Separate files for paired reads? Note odd categories used for paired read type
if (separate_pair_files && cat%2==1) {
argIndex++;
if (argIndex>=argc)
exitErrorf(EXIT_FAILURE, false, "Require left & right filename for -separate mode");
readFastXPair(filetype, seqWriteInfo, argv[argIndex-1], argv[argIndex], cat, &sequenceIndex);
} else {
readFastXFile(filetype, seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
}
break;
case RAW:
if (separate_pair_files && cat%2==1) {
exitErrorf(EXIT_FAILURE, false, "Currently do not support -separate mode for RAW");
}
readRawFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex);
break;
case FASTA_GZ:
readFastXFile(FASTA_GZ, seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
break;
case FASTQ_GZ:
readFastXFile(FASTQ_GZ, seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
break;
case RAW_GZ:
if (separate_pair_files && cat%2==1) {
exitErrorf(EXIT_FAILURE, false, "Currently do not support -separate mode for RAW");
}
readRawGZFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex);
break;
case SAM:
Expand Down
9 changes: 8 additions & 1 deletion src/run.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Copyright 2007, 2008 Daniel Zerbino ([email protected])
static void printUsage()
{
puts("Usage:");
puts("./velveth directory hash_length {[-file_format][-read_type] filename1 [filename2 ...]} {...} [options]");
puts("./velveth directory hash_length {[-file_format][-read_type][-separate|-interleaved] filename1 [filename2 ...]} {...} [options]");
puts("");
puts("\tdirectory\t: directory name for output files");
printf("\thash_length\t: EITHER an odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
Expand All @@ -44,6 +44,10 @@ static void printUsage()
puts("File format options:");
puts("\t-fasta\t-fastq\t-raw\t-fasta.gz\t-fastq.gz\t-raw.gz\t-sam\t-bam");
puts("");
puts("File layout options for paired reads (only for fasta and fastq formats):");
puts("\t-interleaved\t: File contains paired reads interleaved in the one file (default)");
puts("\t-separate\t: Read 2 separate files for paired reads");
puts("");
puts("Read type options:");
puts("\t-short\t-shortPaired");
#if CATEGORIES <= 5
Expand Down Expand Up @@ -72,6 +76,9 @@ static void printUsage()
puts("- Paired-end short reads (remember to interleave paired reads):");
puts("\tvelveth Assem 31 -shortPaired -fasta interleaved.fna");
puts("");
puts("- Paired-end short reads (using separate files for the paired reads)");
puts("\tvelveth Assem 31 -shortPaired -fasta -separate left.fa right.fa");
puts("");
puts("- Two channels and some long reads:");
puts("\tvelveth Assem 43 -short -fastq unmapped.fna -longPaired -fasta SangerReads.fasta");
puts("");
Expand Down

0 comments on commit ce5d396

Please sign in to comment.