class SurpiSampleSheetFormat(model.TextFileFormat):
    """Represents a comma-delimited (csv) sample sheet file used by SURPI+.

    A file is valid when the table in its '[Data]' section can be loaded by
    `surpi_count_fp_to_df`.
    """

    def _validate_(self, level):
        # `level` is not consulted: the same check runs at every level.
        # Loading the sheet raises ValidationError when the '[Data]' table is
        # missing, empty, unparsable, or lacks a required column; the
        # resulting dataframe itself is not needed here.
        surpi_count_fp_to_df(self.path)


def _read_data_section_lines(fp):
    """Collect the table lines of the '[Data]' section of a sample sheet.

    Parameters
    ----------
    fp : str or path-like
        Location of the sample sheet; passed directly to `open`.

    Returns
    -------
    tuple of (bool, list of str)
        Whether a line starting with '[Data]' was seen, and the lines that
        follow it (line endings intact), excluding spacer lines.
    """
    section_found = False
    table_lines = []
    with open(fp, "r") as f:
        for line in f:
            if line.startswith("[Data]"):
                section_found = True
                continue

            if not section_found:
                # still in an earlier section such as [Header] or [Reads]
                continue

            if line.startswith("["):
                # a new section header ends the data table; without this,
                # the lines of any later section would be parsed as rows
                break

            # Spacer lines are either all delimiters (they start with ',')
            # or blank; neither carries table content.
            if line.startswith(",") or not line.strip():
                continue

            table_lines.append(line)

    return section_found, table_lines


def surpi_count_fp_to_df(fp: str) -> pandas.DataFrame:
    """Load the '[Data]' table of a SURPI+ sample sheet into a dataframe.

    The first retained line after the '[Data]' marker is used as the column
    header.  Only the presence of the required columns is checked; the
    values in them are not validated.

    Parameters
    ----------
    fp : str
        Path of the comma-delimited sample sheet (anything `open` accepts).

    Returns
    -------
    pandas.DataFrame
        Every column of the '[Data]' table, plus a BARCODE_KEY column holding
        the INDEX_1_KEY and INDEX_2_KEY values joined by '+'.

    Raises
    ------
    ValidationError
        If there is no '[Data]' section, the section holds no table lines,
        the table cannot be parsed as csv, a required column
        (SAMPLE_NAME_KEY, INDEX_1_KEY, INDEX_2_KEY) is absent, or the table
        has a header but no rows.
    """
    section_found, table_lines = _read_data_section_lines(fp)

    if not section_found:
        raise ValidationError(
            "Expected section starting with '[Data]', but didn't find one")

    if not table_lines:
        raise ValidationError(
            "Expected a table in the '[Data]' section, but the section is "
            "empty")

    # Each collected line still ends with its own line terminator (only the
    # last line of the file may lack one), so concatenate without adding a
    # separator; joining on "\n" would put a blank line after every row.
    data_table_stream = io.StringIO("".join(table_lines))

    try:
        df = pandas.read_csv(data_table_stream, header=0, sep=',')
    except (pandas.errors.ParserError,
            pandas.errors.EmptyDataError) as err:
        # surface malformed tables as a validation failure rather than
        # leaking a pandas-specific exception out of format validation
        raise ValidationError(
            f"Expected a comma-delimited table in the '[Data]' section, "
            f"but it could not be parsed: {err}") from err

    if ((SAMPLE_NAME_KEY not in df.columns) or
            (INDEX_1_KEY not in df.columns) or
            (INDEX_2_KEY not in df.columns)):
        raise ValidationError(
            f"Expected at least '{SAMPLE_NAME_KEY}', '{INDEX_1_KEY}', and "
            f"'{INDEX_2_KEY}' columns, but got {df.columns}")

    if df.empty:
        raise ValidationError("Expected at least one row, but got none")

    # the barcode is the pair of index sequences in "index+index2" form
    df[BARCODE_KEY] = df[INDEX_1_KEY] + "+" + df[INDEX_2_KEY]
    return df
@plugin.register_transformer
def _2(ff: SurpiSampleSheetFormat) -> pandas.DataFrame:
    """Transform a SurpiSampleSheetFormat file into a pandas DataFrame.

    The format object is converted to its string form and handed to
    `surpi_count_fp_to_df`; the dataframe it builds is returned unchanged.
    """
    return surpi_count_fp_to_df(str(ff))
+sample-R-B2,sample-R-B2,B7,B7,7-346,TCCTGGAC,5-298,CTGCTTAA,RNA,Analytical +,,,,,,,,, +sample-R-C2,sample-R-C2,C7,C7,7-347,CTACATGA,5-299,GATTTGAT,RNA,Analytical +,,,,,,,,, +sample-R-D2,sample-R-D2,D7,D7,7-348,CCGGATAG,5-300,GACTCAAA,RNA,Analytical +,,,,,,,,, +sample-R-E2,sample-R-E2,E7,E7,7-349,AACCCGCC,5-301,GAGGATTT,RNA,Analytical +,,,,,,,,, +sample-R-F2,sample-R-F2,F7,F7,7-350,CGAACGTG,5-302,CATCTGTA,RNA,Analytical +,,,,,,,,, +sample-R-G2,sample-R-G2,G7,G7,7-351,CCGTAGAA,5-303,TGCGCTTA,RNA,Analytical +,,,,,,,,, +sample-R-H2,sample-R-H2,H7,H7,7-352,CATCTACT,5-304,TTCCGTTG,RNA,Analytical +,,,,,,,,, +sample-D-A1,sample-D-A1,A8,A8,7-353,AGTCTGCT,5-305,TCCAGGCT,DNA,Analytical +,,,,,,,,, +sample-D-B1,sample-D-B1,B8,B8,7-354,GCCGAATC,5-306,CTGTCCTC,DNA,Analytical +,,,,,,,,, +sample-D-C1,sample-D-C1,C8,C8,7-355,ACTATGAT,5-307,TTCGATAG,DNA,Analytical +,,,,,,,,, +sample-D-D1,sample-D-D1,D8,D8,7-356,CCCTATCT,5-308,GTTAGTGA,DNA,Analytical +,,,,,,,,, +sample-D-E1,sample-D-E1,E8,E8,7-357,CGTTGTCC,5-309,CTTATCGA,DNA,Analytical +,,,,,,,,, +sample-D-F1,sample-D-F1,F8,F8,7-358,TGGAACGG,5-310,GAATAAAG,DNA,Analytical +,,,,,,,,, +sample-D-G1,sample-D-G1,G8,G8,7-359,CCCTTCGG,5-311,GAAGGCAG,DNA,Analytical +,,,,,,,,, +sample-D-H1,sample-D-H1,H8,H8,7-360,TGTCCAAA,5-312,TTGGTTGT,DNA,Analytical +,,,,,,,,, +sample-D-A2,sample-D-A2,A9,A9,7-361,AGTACAAG,5-313,CCCATTGC,DNA,Analytical +,,,,,,,,, +sample-D-B2,sample-D-B2,B9,B9,7-362,TACTGTGA,5-314,GTGTCCAG,DNA,Analytical +,,,,,,,,, +sample-D-C2,sample-D-C2,C9,C9,7-363,CCGGAATT,5-315,GCATACTT,DNA,Analytical +,,,,,,,,, +sample-D-D2,sample-D-D2,D9,D9,7-364,TCGCTCGG,5-316,CCATCGGA,DNA,Analytical +,,,,,,,,, +sample-D-E2,sample-D-E2,E9,E9,7-365,AGTGCGGA,5-317,CCGTTGTC,DNA,Analytical +,,,,,,,,, +sample-D-F2,sample-D-F2,F9,F9,7-366,GCTTCACA,5-318,TAAAGCTA,DNA,Analytical +,,,,,,,,, +sample-D-G2,sample-D-G2,G9,G9,7-367,CCGATCGT,5-319,GACTGTTT,DNA,Analytical +,,,,,,,,, +sample-D-H2,sample-D-H2,H9,H9,7-368,CCGTAAGC,5-320,AGTGAGGT,DNA,Analytical \ No 
newline at end of file diff --git a/q2_surpi/tests/data/surpi_sample_info.txt b/q2_surpi/tests/data/surpi_sample_info.txt deleted file mode 100644 index b1b5416..0000000 --- a/q2_surpi/tests/data/surpi_sample_info.txt +++ /dev/null @@ -1,33 +0,0 @@ -sample barcode -sample_1 AACCCGCC+GAGGATTT -sample_2 AATCGTCA+AGTTAAAG -sample_3 ACTATGAT+TTCGATAG -sample_4 AGTACAAG+CCCATTGC -sample_5 AGTAGTAA+TACTGATA -sample_6 AGTCCCGG+GCAGAAGT -sample_7 AGTCTGCT+TCCAGGCT -sample_8 AGTGCGGA+CCGTTGTC -sample_9 CATCTACT+TTCCGTTG -sample_10 CATTCGGA+GATGGAAA -sample_11 CCCTATCT+GTTAGTGA -sample_12 CCCTTCGG+GAAGGCAG -sample_13 CCGATCGT+GACTGTTT -sample_14 CCGGAATT+GCATACTT -sample_15 CCGGATAG+GACTCAAA -sample_16 CCGTAAGC+AGTGAGGT -sample_17 CCGTAGAA+TGCGCTTA -sample_18 CGAACGTG+CATCTGTA -sample_19 CGCGAAAG+AAAGGAGG -sample_20 CGTTGTCC+CTTATCGA -sample_21 CTACATGA+GATTTGAT -sample_22 GCCGAATC+CTGTCCTC -sample_23 GCTGATTT+ATAGAGGC -sample_24 GCTTCACA+TAAAGCTA -sample_25 TACTAAGG+CTGACTCG -sample_26 TACTGTGA+GTGTCCAG -sample_27 TCCTGGAC+CTGCTTAA -sample_28 TCGCTCGG+CCATCGGA -sample_29 TGGAACGG+GAATAAAG -sample_30 TGTCCAAA+TTGGTTGT -sample_31 TTCGTGGA+ACAAGGTA -sample_32 TTGCCACT+GGGAACTG \ No newline at end of file diff --git a/q2_surpi/tests/data/surpi_sample_info_empty.txt b/q2_surpi/tests/data/surpi_sample_info_empty.txt deleted file mode 100644 index 51241cb..0000000 --- a/q2_surpi/tests/data/surpi_sample_info_empty.txt +++ /dev/null @@ -1 +0,0 @@ -sample barcode \ No newline at end of file diff --git a/q2_surpi/tests/data/surpi_sample_info_empty_data.csv b/q2_surpi/tests/data/surpi_sample_info_empty_data.csv new file mode 100644 index 0000000..09aa4bf --- /dev/null +++ b/q2_surpi/tests/data/surpi_sample_info_empty_data.csv @@ -0,0 +1,36 @@ +[Header],,,,,,,,, +,,,,,,,,, +IEMFileVersion,4,,,,,,,, +,,,,,,,,, +Investigator Name,VS,,,,,,,, +,,,,,,,,, +Experiment Name,WW_NS_230126_qiime,,,,,,,, +,,,,,,,,, +Date,1/26/23,,,,,,,, +,,,,,,,,, +Workflow,GenerateFASTQ,,,,,,,, +,,,,,,,,, 
+Application,FASTQ Only,,,,,,,, +,,,,,,,,, +Assay,NEB,,,,,,,, +,,,,,,,,, +Description,,,,,,,,, +,,,,,,,,, +Chemistry,Amplicon,,,,,,,, +,,,,,,,,, +,,,,,,,,, +,,,,,,,,, +[Reads],,,,,,,,, +,,,,,,,,, +150,,,,,,,,, +,,,,,,,,, +,,,,,,,,, +,,,,,,,,, +[Settings],,,,,,,,, +,,,,,,,,, +ReverseComplement,0,,,,,,,, +,,,,,,,,, +,,,,,,,,, +,,,,,,,,, +[Data],,,,,,,,, +,,,,,,,,, \ No newline at end of file diff --git a/q2_surpi/tests/data/surpi_sample_info_missing_barcode.txt b/q2_surpi/tests/data/surpi_sample_info_missing_barcode.txt deleted file mode 100644 index ebb6a9a..0000000 --- a/q2_surpi/tests/data/surpi_sample_info_missing_barcode.txt +++ /dev/null @@ -1,4 +0,0 @@ -sample -sample_1 -sample_2 -sample_3 \ No newline at end of file diff --git a/q2_surpi/tests/data/surpi_sample_info_missing_index.csv b/q2_surpi/tests/data/surpi_sample_info_missing_index.csv new file mode 100644 index 0000000..59e3544 --- /dev/null +++ b/q2_surpi/tests/data/surpi_sample_info_missing_index.csv @@ -0,0 +1,101 @@ +[Header],,,,,,,, +,,,,,,,, +IEMFileVersion,4,,,,,,, +,,,,,,,, +Investigator Name,VS,,,,,,, +,,,,,,,, +Experiment Name,WW_NS_230126_qiime,,,,,,, +,,,,,,,, +Date,1/26/23,,,,,,, +,,,,,,,, +Workflow,GenerateFASTQ,,,,,,, +,,,,,,,, +Application,FASTQ Only,,,,,,, +,,,,,,,, +Assay,NEB,,,,,,, +,,,,,,,, +Description,,,,,,,, +,,,,,,,, +Chemistry,Amplicon,,,,,,, +,,,,,,,, +,,,,,,,, +,,,,,,,, +[Reads],,,,,,,, +,,,,,,,, +150,,,,,,,, +,,,,,,,, +,,,,,,,, +,,,,,,,, +[Settings],,,,,,,, +,,,,,,,, +ReverseComplement,0,,,,,,, +,,,,,,,, +,,,,,,,, +,,,,,,,, +[Data],,,,,,,, +,,,,,,,, +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,I5_Index_ID,index2,Prep,Type +,,,,,,,, +sample-R-A1,sample-R-A1,A6,A6,7-337,5-386,TACTGATA,RNA,Analytical +,,,,,,,, +sample-R-B1,sample-R-B1,B6,B6,7-338,5-387,CTGACTCG,RNA,Analytical +,,,,,,,, +sample-R-C1,sample-R-C1,C6,C6,7-339,5-388,GATGGAAA,RNA,Analytical +,,,,,,,, +sample-R-D1,sample-R-D1,D6,D6,7-340,5-389,AGTTAAAG,RNA,Analytical +,,,,,,,, 
+sample-R-E1,sample-R-E1,E6,E6,7-341,5-390,ATAGAGGC,RNA,Analytical +,,,,,,,, +sample-R-F1,sample-R-F1,F6,F6,7-342,5-391,AAAGGAGG,RNA,Analytical +,,,,,,,, +sample-R-G1,sample-R-G1,G6,G6,7-343,5-392,GGGAACTG,RNA,Analytical +,,,,,,,, +sample-R-H1,sample-R-H1,H6,H6,7-344,5-393,ACAAGGTA,RNA,Analytical +,,,,,,,, +sample-R-A2,sample-R-A2,A7,A7,7-345,5-297,GCAGAAGT,RNA,Analytical +,,,,,,,, +sample-R-B2,sample-R-B2,B7,B7,7-346,5-298,CTGCTTAA,RNA,Analytical +,,,,,,,, +sample-R-C2,sample-R-C2,C7,C7,7-347,5-299,GATTTGAT,RNA,Analytical +,,,,,,,, +sample-R-D2,sample-R-D2,D7,D7,7-348,5-300,GACTCAAA,RNA,Analytical +,,,,,,,, +sample-R-E2,sample-R-E2,E7,E7,7-349,5-301,GAGGATTT,RNA,Analytical +,,,,,,,, +sample-R-F2,sample-R-F2,F7,F7,7-350,5-302,CATCTGTA,RNA,Analytical +,,,,,,,, +sample-R-G2,sample-R-G2,G7,G7,7-351,5-303,TGCGCTTA,RNA,Analytical +,,,,,,,, +sample-R-H2,sample-R-H2,H7,H7,7-352,5-304,TTCCGTTG,RNA,Analytical +,,,,,,,, +sample-D-A1,sample-D-A1,A8,A8,7-353,5-305,TCCAGGCT,DNA,Analytical +,,,,,,,, +sample-D-B1,sample-D-B1,B8,B8,7-354,5-306,CTGTCCTC,DNA,Analytical +,,,,,,,, +sample-D-C1,sample-D-C1,C8,C8,7-355,5-307,TTCGATAG,DNA,Analytical +,,,,,,,, +sample-D-D1,sample-D-D1,D8,D8,7-356,5-308,GTTAGTGA,DNA,Analytical +,,,,,,,, +sample-D-E1,sample-D-E1,E8,E8,7-357,5-309,CTTATCGA,DNA,Analytical +,,,,,,,, +sample-D-F1,sample-D-F1,F8,F8,7-358,5-310,GAATAAAG,DNA,Analytical +,,,,,,,, +sample-D-G1,sample-D-G1,G8,G8,7-359,5-311,GAAGGCAG,DNA,Analytical +,,,,,,,, +sample-D-H1,sample-D-H1,H8,H8,7-360,5-312,TTGGTTGT,DNA,Analytical +,,,,,,,, +sample-D-A2,sample-D-A2,A9,A9,7-361,5-313,CCCATTGC,DNA,Analytical +,,,,,,,, +sample-D-B2,sample-D-B2,B9,B9,7-362,5-314,GTGTCCAG,DNA,Analytical +,,,,,,,, +sample-D-C2,sample-D-C2,C9,C9,7-363,5-315,GCATACTT,DNA,Analytical +,,,,,,,, +sample-D-D2,sample-D-D2,D9,D9,7-364,5-316,CCATCGGA,DNA,Analytical +,,,,,,,, +sample-D-E2,sample-D-E2,E9,E9,7-365,5-317,CCGTTGTC,DNA,Analytical +,,,,,,,, +sample-D-F2,sample-D-F2,F9,F9,7-366,5-318,TAAAGCTA,DNA,Analytical 
+,,,,,,,, +sample-D-G2,sample-D-G2,G9,G9,7-367,5-319,GACTGTTT,DNA,Analytical +,,,,,,,, +sample-D-H2,sample-D-H2,H9,H9,7-368,5-320,AGTGAGGT,DNA,Analytical \ No newline at end of file diff --git a/q2_surpi/tests/data/surpi_sample_info_missing_sample.csv b/q2_surpi/tests/data/surpi_sample_info_missing_sample.csv new file mode 100644 index 0000000..f720e83 --- /dev/null +++ b/q2_surpi/tests/data/surpi_sample_info_missing_sample.csv @@ -0,0 +1,101 @@ +[Header],,,,,,,, +,,,,,,,, +IEMFileVersion,4,,,,,,, +,,,,,,,, +Investigator Name,VS,,,,,,, +,,,,,,,, +Experiment Name,WW_NS_230126_qiime,,,,,,, +,,,,,,,, +Date,1/26/23,,,,,,, +,,,,,,,, +Workflow,GenerateFASTQ,,,,,,, +,,,,,,,, +Application,FASTQ Only,,,,,,, +,,,,,,,, +Assay,NEB,,,,,,, +,,,,,,,, +Description,,,,,,,, +,,,,,,,, +Chemistry,Amplicon,,,,,,, +,,,,,,,, +,,,,,,,, +,,,,,,,, +[Reads],,,,,,,, +,,,,,,,, +150,,,,,,,, +,,,,,,,, +,,,,,,,, +,,,,,,,, +[Settings],,,,,,,, +,,,,,,,, +ReverseComplement,0,,,,,,, +,,,,,,,, +,,,,,,,, +,,,,,,,, +[Data],,,,,,,, +,,,,,,,, +Sample_ID,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Prep,Type +,,,,,,,, +sample-R-A1,A6,A6,7-337,AGTAGTAA,5-386,TACTGATA,RNA,Analytical +,,,,,,,, +sample-R-B1,B6,B6,7-338,TACTAAGG,5-387,CTGACTCG,RNA,Analytical +,,,,,,,, +sample-R-C1,C6,C6,7-339,CATTCGGA,5-388,GATGGAAA,RNA,Analytical +,,,,,,,, +sample-R-D1,D6,D6,7-340,AATCGTCA,5-389,AGTTAAAG,RNA,Analytical +,,,,,,,, +sample-R-E1,E6,E6,7-341,GCTGATTT,5-390,ATAGAGGC,RNA,Analytical +,,,,,,,, +sample-R-F1,F6,F6,7-342,CGCGAAAG,5-391,AAAGGAGG,RNA,Analytical +,,,,,,,, +sample-R-G1,G6,G6,7-343,TTGCCACT,5-392,GGGAACTG,RNA,Analytical +,,,,,,,, +sample-R-H1,H6,H6,7-344,TTCGTGGA,5-393,ACAAGGTA,RNA,Analytical +,,,,,,,, +sample-R-A2,A7,A7,7-345,AGTCCCGG,5-297,GCAGAAGT,RNA,Analytical +,,,,,,,, +sample-R-B2,B7,B7,7-346,TCCTGGAC,5-298,CTGCTTAA,RNA,Analytical +,,,,,,,, +sample-R-C2,C7,C7,7-347,CTACATGA,5-299,GATTTGAT,RNA,Analytical +,,,,,,,, +sample-R-D2,D7,D7,7-348,CCGGATAG,5-300,GACTCAAA,RNA,Analytical +,,,,,,,, 
+sample-R-E2,E7,E7,7-349,AACCCGCC,5-301,GAGGATTT,RNA,Analytical +,,,,,,,, +sample-R-F2,F7,F7,7-350,CGAACGTG,5-302,CATCTGTA,RNA,Analytical +,,,,,,,, +sample-R-G2,G7,G7,7-351,CCGTAGAA,5-303,TGCGCTTA,RNA,Analytical +,,,,,,,, +sample-R-H2,H7,H7,7-352,CATCTACT,5-304,TTCCGTTG,RNA,Analytical +,,,,,,,, +sample-D-A1,A8,A8,7-353,AGTCTGCT,5-305,TCCAGGCT,DNA,Analytical +,,,,,,,, +sample-D-B1,B8,B8,7-354,GCCGAATC,5-306,CTGTCCTC,DNA,Analytical +,,,,,,,, +sample-D-C1,C8,C8,7-355,ACTATGAT,5-307,TTCGATAG,DNA,Analytical +,,,,,,,, +sample-D-D1,D8,D8,7-356,CCCTATCT,5-308,GTTAGTGA,DNA,Analytical +,,,,,,,, +sample-D-E1,E8,E8,7-357,CGTTGTCC,5-309,CTTATCGA,DNA,Analytical +,,,,,,,, +sample-D-F1,F8,F8,7-358,TGGAACGG,5-310,GAATAAAG,DNA,Analytical +,,,,,,,, +sample-D-G1,G8,G8,7-359,CCCTTCGG,5-311,GAAGGCAG,DNA,Analytical +,,,,,,,, +sample-D-H1,H8,H8,7-360,TGTCCAAA,5-312,TTGGTTGT,DNA,Analytical +,,,,,,,, +sample-D-A2,A9,A9,7-361,AGTACAAG,5-313,CCCATTGC,DNA,Analytical +,,,,,,,, +sample-D-B2,B9,B9,7-362,TACTGTGA,5-314,GTGTCCAG,DNA,Analytical +,,,,,,,, +sample-D-C2,C9,C9,7-363,CCGGAATT,5-315,GCATACTT,DNA,Analytical +,,,,,,,, +sample-D-D2,D9,D9,7-364,TCGCTCGG,5-316,CCATCGGA,DNA,Analytical +,,,,,,,, +sample-D-E2,E9,E9,7-365,AGTGCGGA,5-317,CCGTTGTC,DNA,Analytical +,,,,,,,, +sample-D-F2,F9,F9,7-366,GCTTCACA,5-318,TAAAGCTA,DNA,Analytical +,,,,,,,, +sample-D-G2,G9,G9,7-367,CCGATCGT,5-319,GACTGTTT,DNA,Analytical +,,,,,,,, +sample-D-H2,H9,H9,7-368,CCGTAAGC,5-320,AGTGAGGT,DNA,Analytical \ No newline at end of file diff --git a/q2_surpi/tests/data/surpi_sample_info_missing_sample.txt b/q2_surpi/tests/data/surpi_sample_info_missing_sample.txt deleted file mode 100644 index cc797b1..0000000 --- a/q2_surpi/tests/data/surpi_sample_info_missing_sample.txt +++ /dev/null @@ -1,4 +0,0 @@ -barcode -AACCCGCC+GAGGATTT -AATCGTCA+AGTTAAAG -ACTATGAT+TTCGATAG \ No newline at end of file diff --git a/q2_surpi/tests/test_formats.py b/q2_surpi/tests/test_formats.py index 5b2ab5b..d0c24c4 100644 --- 
a/q2_surpi/tests/test_formats.py +++ b/q2_surpi/tests/test_formats.py @@ -33,7 +33,7 @@ class TestSurpiSampleSheetFormat(TestPluginBase): package = f'{__package_name__}.tests' def test_surpisamplesheet_format_valid(self): - filenames = ['surpi_sample_info.txt'] + filenames = ['surpi_sample_info.csv'] filepaths = [self.get_data_path(filename) for filename in filenames] @@ -43,12 +43,12 @@ def test_surpisamplesheet_format_valid(self): def test_surpisamplesheet_format_invalid(self): filenames = [ - # empty - 'surpi_sample_info_empty.txt', - # missing column "sample" - 'surpi_sample_info_missing_sample.txt', - # missing column "barcode" - 'surpi_sample_info_missing_barcode.txt' + # empty data section + 'surpi_sample_info_empty_data.csv', + # missing column "Sample_Name" + 'surpi_sample_info_missing_sample.csv', + # missing column "index" + 'surpi_sample_info_missing_index.csv' ] filepaths = [self.get_data_path(filename) for filename in filenames] diff --git a/q2_surpi/tests/test_transformers.py b/q2_surpi/tests/test_transformers.py index e68885d..69649ba 100644 --- a/q2_surpi/tests/test_transformers.py +++ b/q2_surpi/tests/test_transformers.py @@ -65,34 +65,34 @@ class TestSurpiSampleSheetFormatTransformers(TestPluginBase): package = f'{__package_name__}.tests' def test_surpisamplesheetformat_to_dataframe(self): - input_fname = "surpi_sample_info.txt" + input_fname = "surpi_sample_info.csv" expected_dict = { SAMPLE_NAME_KEY: [ - "sample_1", "sample_2", "sample_3", "sample_4", "sample_5", - "sample_6", "sample_7", "sample_8", "sample_9", "sample_10", - "sample_11", "sample_12", "sample_13", "sample_14", - "sample_15", "sample_16", "sample_17", "sample_18", - "sample_19", "sample_20", "sample_21", "sample_22", - "sample_23", "sample_24", "sample_25", "sample_26", - "sample_27", "sample_28", "sample_29", "sample_30", - "sample_31", "sample_32"], - BARCODE_KEY: ["AACCCGCC+GAGGATTT", "AATCGTCA+AGTTAAAG", - "ACTATGAT+TTCGATAG", "AGTACAAG+CCCATTGC", - "AGTAGTAA+TACTGATA", 
"AGTCCCGG+GCAGAAGT", - "AGTCTGCT+TCCAGGCT", "AGTGCGGA+CCGTTGTC", - "CATCTACT+TTCCGTTG", "CATTCGGA+GATGGAAA", - "CCCTATCT+GTTAGTGA", "CCCTTCGG+GAAGGCAG", - "CCGATCGT+GACTGTTT", "CCGGAATT+GCATACTT", - "CCGGATAG+GACTCAAA", "CCGTAAGC+AGTGAGGT", - "CCGTAGAA+TGCGCTTA", "CGAACGTG+CATCTGTA", - "CGCGAAAG+AAAGGAGG", "CGTTGTCC+CTTATCGA", - "CTACATGA+GATTTGAT", "GCCGAATC+CTGTCCTC", - "GCTGATTT+ATAGAGGC", "GCTTCACA+TAAAGCTA", - "TACTAAGG+CTGACTCG", "TACTGTGA+GTGTCCAG", - "TCCTGGAC+CTGCTTAA", "TCGCTCGG+CCATCGGA", - "TGGAACGG+GAATAAAG", "TGTCCAAA+TTGGTTGT", - "TTCGTGGA+ACAAGGTA", "TTGCCACT+GGGAACTG"], + "sample-R-A1", "sample-R-B1", "sample-R-C1", "sample-R-D1", + "sample-R-E1", "sample-R-F1", "sample-R-G1", "sample-R-H1", + "sample-R-A2", "sample-R-B2", "sample-R-C2", "sample-R-D2", + "sample-R-E2", "sample-R-F2", "sample-R-G2", "sample-R-H2", + "sample-D-A1", "sample-D-B1", "sample-D-C1", "sample-D-D1", + "sample-D-E1", "sample-D-F1", "sample-D-G1", "sample-D-H1", + "sample-D-A2", "sample-D-B2", "sample-D-C2", "sample-D-D2", + "sample-D-E2", "sample-D-F2", "sample-D-G2", "sample-D-H2"], + BARCODE_KEY: ["AGTAGTAA+TACTGATA", "TACTAAGG+CTGACTCG", + "CATTCGGA+GATGGAAA", "AATCGTCA+AGTTAAAG", + "GCTGATTT+ATAGAGGC", "CGCGAAAG+AAAGGAGG", + "TTGCCACT+GGGAACTG", "TTCGTGGA+ACAAGGTA", + "AGTCCCGG+GCAGAAGT", "TCCTGGAC+CTGCTTAA", + "CTACATGA+GATTTGAT", "CCGGATAG+GACTCAAA", + "AACCCGCC+GAGGATTT", "CGAACGTG+CATCTGTA", + "CCGTAGAA+TGCGCTTA", "CATCTACT+TTCCGTTG", + "AGTCTGCT+TCCAGGCT", "GCCGAATC+CTGTCCTC", + "ACTATGAT+TTCGATAG", "CCCTATCT+GTTAGTGA", + "CGTTGTCC+CTTATCGA", "TGGAACGG+GAATAAAG", + "CCCTTCGG+GAAGGCAG", "TGTCCAAA+TTGGTTGT", + "AGTACAAG+CCCATTGC", "TACTGTGA+GTGTCCAG", + "CCGGAATT+GCATACTT", "TCGCTCGG+CCATCGGA", + "AGTGCGGA+CCGTTGTC", "GCTTCACA+TAAAGCTA", + "CCGATCGT+GACTGTTT", "CCGTAAGC+AGTGAGGT"], } expected_df = pandas.DataFrame(expected_dict) @@ -101,4 +101,7 @@ def test_surpisamplesheetformat_to_dataframe(self): SurpiSampleSheetFormat, pandas.DataFrame, filename=input_fname) - 
assert_frame_equal(obs_df, expected_df) + # the only parts of the sample sheet that are used are the sample name + # and the barcode, so we only compare those columns + partial_obs_df = obs_df[[SAMPLE_NAME_KEY, BARCODE_KEY]] + assert_frame_equal(partial_obs_df, expected_df)