Skip to content

Commit

Permalink
added citation, updated sample sheet handling
Browse files Browse the repository at this point in the history
  • Loading branch information
AmandaBirmingham committed Sep 24, 2024
1 parent fd8364f commit d73af00
Show file tree
Hide file tree
Showing 13 changed files with 440 additions and 95 deletions.
69 changes: 51 additions & 18 deletions q2_surpi/_formats_and_types.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
import pandas
from qiime2.plugin import SemanticType, ValidationError
import qiime2.plugin.model as model
Expand All @@ -7,7 +8,9 @@
# Column-name keys shared by the SURPI+ formats in this module.
GENUS_KEY = "genus"
FAMILY_KEY = "family"
TAG_KEY = "tag"
# Columns read from the [Data] table of the sample sheet.
SAMPLE_NAME_KEY = 'Sample_Name'
INDEX_1_KEY = "index"
INDEX_2_KEY = "index2"
# Derived column: surpi_count_fp_to_df fills it with "<index>+<index2>".
BARCODE_KEY = 'barcode'


Expand Down Expand Up @@ -39,26 +42,56 @@ def _validate_(self, level):
raise ValidationError("Expected at least one row, but got none")


class SurpiSampleSheetFormat(model.TextFileFormat):
    """Represents a comma-delimited (CSV) sample sheet file used by SURPI+."""

    def _validate_(self, level):
        """Validate the sample sheet by parsing its ``[Data]`` table.

        Validation is delegated to ``surpi_count_fp_to_df`` so that the
        format check and the dataframe transformer can never disagree about
        what a valid sample sheet is. The parsed dataframe is discarded;
        only the absence of an error matters here.

        ``level`` is accepted for the framework's validation interface but
        is not used: the same checks run whatever its value.

        Raises
        ------
        ValidationError
            If the ``[Data]`` table is missing or empty, lacks a required
            column, or has no rows.
        """
        surpi_count_fp_to_df(str(self.path))


def surpi_count_fp_to_df(fp: str) -> pandas.DataFrame:
    """Load the ``[Data]`` table of a SURPI+ sample sheet into a dataframe.

    The sample sheet is a comma-delimited file made of bracketed sections
    (e.g. ``[Header]``, ``[Reads]``, ``[Settings]``, ``[Data]``). Only the
    rows that follow the ``[Data]`` line are parsed; the first non-spacer
    row of that section is used as the column header.

    Parameters
    ----------
    fp : str
        Path of the sample sheet file to read.

    Returns
    -------
    pandas.DataFrame
        One row per sample sheet data row, with all of the columns found in
        the ``[Data]`` table plus a ``BARCODE_KEY`` column holding
        ``"<index>+<index2>"``.

    Raises
    ------
    ValidationError
        If no non-spacer lines are found after a ``[Data]`` line, if any of
        the ``SAMPLE_NAME_KEY``, ``INDEX_1_KEY``, or ``INDEX_2_KEY`` columns
        is missing, or if the table has a header but no rows.
    """
    data_table_lines = []
    in_data_section = False
    with open(fp, "r") as f:
        for line in f:
            if line.startswith("[Data]"):
                in_data_section = True
                continue

            if not in_data_section:
                continue

            # Any other "[Section]" header ends the data table; without this
            # a trailing section would be parsed as if it were sample rows.
            if line.startswith("["):
                break

            # Skip spacer rows: blank lines and rows holding only delimiters
            # (e.g. ",,,,,,,,,"). Rows that merely have an empty first field
            # are real data and are kept.
            if line.strip().strip(","):
                data_table_lines.append(line)

    if not data_table_lines:
        raise ValidationError(
            "Expected section starting with '[Data]', but didn't find one")

    # Each collected line still carries its own line ending, so concatenate
    # them as-is rather than inserting a second newline between rows.
    data_table_stream = io.StringIO("".join(data_table_lines))

    # Parse the table as csv and check that the fixed columns are present.
    # The values in the columns are not validated, as we don't know what
    # they should be.
    df = pandas.read_csv(data_table_stream, header=0, sep=',')

    required_columns = (SAMPLE_NAME_KEY, INDEX_1_KEY, INDEX_2_KEY)
    if any(column not in df.columns for column in required_columns):
        raise ValidationError(
            f"Expected at least '{SAMPLE_NAME_KEY}', '{INDEX_1_KEY}', and "
            f"'{INDEX_2_KEY}' columns, but got {df.columns}")

    if len(df) == 0:
        raise ValidationError("Expected at least one row, but got none")

    df[BARCODE_KEY] = df[INDEX_1_KEY] + "+" + df[INDEX_2_KEY]
    return df


SurpiCountTableDirectoryFormat = model.SingleFileDirectoryFormat(
Expand Down
11 changes: 11 additions & 0 deletions q2_surpi/citations.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
@ARTICLE{Miller2019,
title={Laboratory validation of a clinical metagenomic sequencing assay for pathogen detection in cerebrospinal fluid},
author={Miller, Steve and Naccache, Samia N and Samayoa, Erik and Messacar, Kevin and Arevalo, Shaun and Federman, Scot and Stryke, Doug and Pham, Elizabeth and Fung, Becky and Bolosky, William J and Ingebrigtsen, Danielle and Lorizio, Walter and Paff, Sandra M and Leake, John A and Pesano, Rick and DeBiasi, Roberta and Dominguez, Samuel and Chiu, Charles Y},
journal={Genome Res.},
publisher={Cold Spring Harbor Laboratory},
volume={29},
number={5},
pages={831--842},
month={may},
year={2019}
}
5 changes: 3 additions & 2 deletions q2_surpi/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import q2_surpi
from q2_surpi._formats_and_types import (
SurpiCountTable, SurpiCountTableFormat, SurpiCountTableDirectoryFormat,
SurpiSampleSheet, SurpiSampleSheetFormat, SurpiSampleSheetDirectoryFormat)
SurpiSampleSheet, SurpiSampleSheetFormat, SurpiSampleSheetDirectoryFormat,
surpi_count_fp_to_df)


plugin = Plugin(
Expand Down Expand Up @@ -41,7 +42,7 @@ def _1(ff: SurpiCountTableFormat) -> pandas.DataFrame:
@plugin.register_transformer
def _2(ff: SurpiSampleSheetFormat) -> pandas.DataFrame:
    """Load a SurpiSampleSheetFormat into a dataframe.

    Parsing is delegated to ``surpi_count_fp_to_df``, the same routine the
    format uses to validate itself, so the file is read exactly once and
    with the comma-delimited, sectioned layout the format expects.
    """
    return surpi_count_fp_to_df(str(ff))


Expand Down
101 changes: 101 additions & 0 deletions q2_surpi/tests/data/surpi_sample_info.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
[Header],,,,,,,,,
,,,,,,,,,
IEMFileVersion,4,,,,,,,,
,,,,,,,,,
Investigator Name,VS,,,,,,,,
,,,,,,,,,
Experiment Name,WW_NS_230126_qiime,,,,,,,,
,,,,,,,,,
Date,1/26/23,,,,,,,,
,,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,
,,,,,,,,,
Application,FASTQ Only,,,,,,,,
,,,,,,,,,
Assay,NEB,,,,,,,,
,,,,,,,,,
Description,,,,,,,,,
,,,,,,,,,
Chemistry,Amplicon,,,,,,,,
,,,,,,,,,
,,,,,,,,,
,,,,,,,,,
[Reads],,,,,,,,,
,,,,,,,,,
150,,,,,,,,,
,,,,,,,,,
,,,,,,,,,
,,,,,,,,,
[Settings],,,,,,,,,
,,,,,,,,,
ReverseComplement,0,,,,,,,,
,,,,,,,,,
,,,,,,,,,
,,,,,,,,,
[Data],,,,,,,,,
,,,,,,,,,
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Prep,Type
,,,,,,,,,
sample-R-A1,sample-R-A1,A6,A6,7-337,AGTAGTAA,5-386,TACTGATA,RNA,Analytical
,,,,,,,,,
sample-R-B1,sample-R-B1,B6,B6,7-338,TACTAAGG,5-387,CTGACTCG,RNA,Analytical
,,,,,,,,,
sample-R-C1,sample-R-C1,C6,C6,7-339,CATTCGGA,5-388,GATGGAAA,RNA,Analytical
,,,,,,,,,
sample-R-D1,sample-R-D1,D6,D6,7-340,AATCGTCA,5-389,AGTTAAAG,RNA,Analytical
,,,,,,,,,
sample-R-E1,sample-R-E1,E6,E6,7-341,GCTGATTT,5-390,ATAGAGGC,RNA,Analytical
,,,,,,,,,
sample-R-F1,sample-R-F1,F6,F6,7-342,CGCGAAAG,5-391,AAAGGAGG,RNA,Analytical
,,,,,,,,,
sample-R-G1,sample-R-G1,G6,G6,7-343,TTGCCACT,5-392,GGGAACTG,RNA,Analytical
,,,,,,,,,
sample-R-H1,sample-R-H1,H6,H6,7-344,TTCGTGGA,5-393,ACAAGGTA,RNA,Analytical
,,,,,,,,,
sample-R-A2,sample-R-A2,A7,A7,7-345,AGTCCCGG,5-297,GCAGAAGT,RNA,Analytical
,,,,,,,,,
sample-R-B2,sample-R-B2,B7,B7,7-346,TCCTGGAC,5-298,CTGCTTAA,RNA,Analytical
,,,,,,,,,
sample-R-C2,sample-R-C2,C7,C7,7-347,CTACATGA,5-299,GATTTGAT,RNA,Analytical
,,,,,,,,,
sample-R-D2,sample-R-D2,D7,D7,7-348,CCGGATAG,5-300,GACTCAAA,RNA,Analytical
,,,,,,,,,
sample-R-E2,sample-R-E2,E7,E7,7-349,AACCCGCC,5-301,GAGGATTT,RNA,Analytical
,,,,,,,,,
sample-R-F2,sample-R-F2,F7,F7,7-350,CGAACGTG,5-302,CATCTGTA,RNA,Analytical
,,,,,,,,,
sample-R-G2,sample-R-G2,G7,G7,7-351,CCGTAGAA,5-303,TGCGCTTA,RNA,Analytical
,,,,,,,,,
sample-R-H2,sample-R-H2,H7,H7,7-352,CATCTACT,5-304,TTCCGTTG,RNA,Analytical
,,,,,,,,,
sample-D-A1,sample-D-A1,A8,A8,7-353,AGTCTGCT,5-305,TCCAGGCT,DNA,Analytical
,,,,,,,,,
sample-D-B1,sample-D-B1,B8,B8,7-354,GCCGAATC,5-306,CTGTCCTC,DNA,Analytical
,,,,,,,,,
sample-D-C1,sample-D-C1,C8,C8,7-355,ACTATGAT,5-307,TTCGATAG,DNA,Analytical
,,,,,,,,,
sample-D-D1,sample-D-D1,D8,D8,7-356,CCCTATCT,5-308,GTTAGTGA,DNA,Analytical
,,,,,,,,,
sample-D-E1,sample-D-E1,E8,E8,7-357,CGTTGTCC,5-309,CTTATCGA,DNA,Analytical
,,,,,,,,,
sample-D-F1,sample-D-F1,F8,F8,7-358,TGGAACGG,5-310,GAATAAAG,DNA,Analytical
,,,,,,,,,
sample-D-G1,sample-D-G1,G8,G8,7-359,CCCTTCGG,5-311,GAAGGCAG,DNA,Analytical
,,,,,,,,,
sample-D-H1,sample-D-H1,H8,H8,7-360,TGTCCAAA,5-312,TTGGTTGT,DNA,Analytical
,,,,,,,,,
sample-D-A2,sample-D-A2,A9,A9,7-361,AGTACAAG,5-313,CCCATTGC,DNA,Analytical
,,,,,,,,,
sample-D-B2,sample-D-B2,B9,B9,7-362,TACTGTGA,5-314,GTGTCCAG,DNA,Analytical
,,,,,,,,,
sample-D-C2,sample-D-C2,C9,C9,7-363,CCGGAATT,5-315,GCATACTT,DNA,Analytical
,,,,,,,,,
sample-D-D2,sample-D-D2,D9,D9,7-364,TCGCTCGG,5-316,CCATCGGA,DNA,Analytical
,,,,,,,,,
sample-D-E2,sample-D-E2,E9,E9,7-365,AGTGCGGA,5-317,CCGTTGTC,DNA,Analytical
,,,,,,,,,
sample-D-F2,sample-D-F2,F9,F9,7-366,GCTTCACA,5-318,TAAAGCTA,DNA,Analytical
,,,,,,,,,
sample-D-G2,sample-D-G2,G9,G9,7-367,CCGATCGT,5-319,GACTGTTT,DNA,Analytical
,,,,,,,,,
sample-D-H2,sample-D-H2,H9,H9,7-368,CCGTAAGC,5-320,AGTGAGGT,DNA,Analytical
33 changes: 0 additions & 33 deletions q2_surpi/tests/data/surpi_sample_info.txt

This file was deleted.

1 change: 0 additions & 1 deletion q2_surpi/tests/data/surpi_sample_info_empty.txt

This file was deleted.

36 changes: 36 additions & 0 deletions q2_surpi/tests/data/surpi_sample_info_empty_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
[Header],,,,,,,,,
,,,,,,,,,
IEMFileVersion,4,,,,,,,,
,,,,,,,,,
Investigator Name,VS,,,,,,,,
,,,,,,,,,
Experiment Name,WW_NS_230126_qiime,,,,,,,,
,,,,,,,,,
Date,1/26/23,,,,,,,,
,,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,
,,,,,,,,,
Application,FASTQ Only,,,,,,,,
,,,,,,,,,
Assay,NEB,,,,,,,,
,,,,,,,,,
Description,,,,,,,,,
,,,,,,,,,
Chemistry,Amplicon,,,,,,,,
,,,,,,,,,
,,,,,,,,,
,,,,,,,,,
[Reads],,,,,,,,,
,,,,,,,,,
150,,,,,,,,,
,,,,,,,,,
,,,,,,,,,
,,,,,,,,,
[Settings],,,,,,,,,
,,,,,,,,,
ReverseComplement,0,,,,,,,,
,,,,,,,,,
,,,,,,,,,
,,,,,,,,,
[Data],,,,,,,,,
,,,,,,,,,
4 changes: 0 additions & 4 deletions q2_surpi/tests/data/surpi_sample_info_missing_barcode.txt

This file was deleted.

Loading

0 comments on commit d73af00

Please sign in to comment.