-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support for heterozygosity in ABH format. #52
Labels
Comments
Current solution (in the interest of time): alter the ABH format on a development site and provide the file that way. I am including the patch here done by @carolyncaron as guidance to anyone trying to implement this at a future date. diff --git a/includes/vcf_filter.trpdownload.abh.inc b/includes/vcf_filter.trpdownload.abh.inc
index 92d6139..9f77d32 100755
--- a/includes/vcf_filter.trpdownload.abh.inc
+++ b/includes/vcf_filter.trpdownload.abh.inc
@@ -54,46 +54,46 @@ function vcf_filter_abh_generate_file($variables, $job_id = NULL) {
// Filter
//================
-
- /**
+
+ /**
* Need to modify user input for germplams filter because maternal/paternal paternents may not include in user's input
* read all germplasm names from filter_criteria-restrict_dataset-germplasm_available
* obtain paternal/maternal germplasm names from available and update to select
* TESTED: it's fine to have repeated germplasm names in a list, vcftools can deal with it
*/
if (!empty($variables['q']['restrict_dataset']['germplasm_select'])){
-
+
$filter_restrict_dataset_germplasm_all = $variables['q']['restrict_dataset']['germplasm_available'];
$filter_restrict_dataset_germplasm_all = trim($filter_restrict_dataset_germplasm_all);
$filter_restrict_dataset_germplasm_all = explode("\n", $filter_restrict_dataset_germplasm_all);
-
+
$filter_restrict_dataset_germplasm_user_select = $variables['q']['restrict_dataset']['germplasm_select'];
$filter_restrict_dataset_germplasm_user_select = trim($filter_restrict_dataset_germplasm_user_select);
$filter_restrict_dataset_germplasm_user_select = explode("\n", $filter_restrict_dataset_germplasm_user_select);
-
+
$partent_added = 0;
$two_partents = array();
while ($partent_added < 2){
$one_partent = array_shift($filter_restrict_dataset_germplasm_all);
-
+
if (preg_match('/^#/', $one_partent)){
continue;
}
-
+
if (preg_match('/^$/', $one_partent)){
continue;
}
-
+
array_push($two_partents, $one_partent);
$partent_added = $partent_added + 1;
-
+
}
array_unshift($filter_restrict_dataset_germplasm_user_select, $two_partents[1]);
array_unshift($filter_restrict_dataset_germplasm_user_select, $two_partents[0]);
$filter_restrict_dataset_germplasm_user_select = implode("\n", $filter_restrict_dataset_germplasm_user_select);
$variables['q']['restrict_dataset']['germplasm_select'] = $filter_restrict_dataset_germplasm_user_select;
}
-
+
print "\nFiltering the Original VCF\n";
$filtered_vcf = vcf_filter_filter_file($vcf_file->file_path, $variables['q'], $drush_use_limited);
if (!$filtered_vcf) {
@@ -254,6 +254,10 @@ function vcf_filter_convert_VCF_to_ABH($input_file, $maternal_parent, $paternal_
continue;
}
$converted_line = array($chrom, $pos);
+ // Declare the Mom and Dad variables to something arbitrary- these are only used to keep track
+ // of heterozygous parents
+ $mom = "M";
+ $dad = "D";
foreach ($sample_names as $current_sample) {
@@ -268,6 +272,7 @@ function vcf_filter_convert_VCF_to_ABH($input_file, $maternal_parent, $paternal_
// Now the calls are in an array
$genotype_calls = $marker[$current_sample]['GT'];
+ // CHECK FOR MATERNAL GENOTYPE
if (strcmp($current_sample, $maternal_parent) === 0) {
if (($genotype_calls[0] === '.') || ($genotype_calls[1] === '.')) {
// The maternal parent is missing a genotype, thus skip this site completely
@@ -276,78 +281,138 @@ function vcf_filter_convert_VCF_to_ABH($input_file, $maternal_parent, $paternal_
continue 2;
}
else if (($genotype_calls[0] !== $genotype_calls[1])) {
- // The maternal parent must be heterozygous, also skip this site.
- #print "Skipping site: ( $chrom $pos ) due to a heterozygous call in $maternal_parent\n";
+ // The maternal parent must be heterozygous
+ $mom = "H";
$skipping_site_count["heterozygous_call_maternal"]++;
- continue 2;
- }
- $maternal_genotype = $genotype_calls;
- array_push($converted_line, "A");
- continue;
+ }
+ $maternal_genotype = $genotype_calls;
+ continue;
}
+
+ // CHECK FOR PATERNAL GENOTYPE
+ // Both Mom and Dad are assigned symbols in this statement block
else if (strcmp($current_sample, $paternal_parent) === 0) {
if (($genotype_calls[0] === '.') || ($genotype_calls[1] === '.')) {
// The paternal parent is missing a genotype, thus skip this site completely
#print "Skipping site: ( $chrom $pos ) due to missing call in $paternal_parent\n";
$skipping_site_count["missing_call_paternal"]++;
continue 2;
- }
- else if (($genotype_calls[0] !== $genotype_calls[1])) {
- // The paternal parent must be heterozygous, also skip this site.
- #print "Skipping site: ( $chrom $pos ) due to a heterozygous call in $paternal_parent\n";
- $skipping_site_count["heterozygous_call_paternal"]++;
- continue 2;
- }
- else if (($genotype_calls === $maternal_genotype)) {
- // Dad has the same genotype as Mom? That can't be right. Skip this too.
- #print "Skipping site: ( $chrom $pos ) due to parents having matching genotype: $genotype_calls[0]/$genotype_calls[1]\n";
+ }
+
+ // If Mom and Dad are equal, then check first for heterozygosity. Otherwise assign "S" to both
+ if (($genotype_calls === $maternal_genotype)) {
+ if ($mom == "H") {
+ array_push($converted_line, "H");
+ array_push($converted_line, "H");
+ }
+ else {
+ array_push($converted_line, "S");
+ array_push($converted_line, "S");
+ $mom = "S";
+ }
$skipping_site_count["parents_matching_genotype"]++;
- continue 2;
- }
+ }
+ // Now that we ruled out missing/same genotypes, check if mom is het and print her
+ // symbol before we print dad's
+ else {
+ // Print Mom
+ if ($mom == "H") {
+ array_push($converted_line, "H");
+ }
+ else {
+ array_push($converted_line, "A");
+ }
+ // Print Dad
+ if ($genotype_calls[0] === $genotype_calls[1]) {
+ array_push($converted_line, "B");
+ }
+ else {
+ // Dad must be hetero
+ $dad = "H";
+ array_push($converted_line, "H");
+ $skipping_site_count["heterozygous_call_paternal"]++;
+ }
+ }
$paternal_genotype = $genotype_calls;
- array_push($converted_line, "B");
continue;
}
- // Translate the alleles to:
+ // Check if the (non-parental) call is missing
+ if (($genotype_calls[0] === ".") || ($genotype_calls[1] === ".")) {
+ array_push($converted_line, "-");
+ continue;
+ }
+
+ // If Mom is "S" then Dad is too. Assign S,H or O to the calls for the sublines, not A or B.
+ if ($mom == "S") {
+ if ($genotype_calls[0] == $maternal_genotype[0]) {
+ if ($genotype_calls[1] == $maternal_genotype[1]) {
+ array_push($converted_line, "S");
+ }
+ else {
+ array_push($converted_line, "H");
+ }
+ }
+ else {
+ if ($genotype_calls[1] == $maternal_genotype[1]) {
+ array_push($converted_line, "H");
+ }
+ else {
+ array_push($converted_line, "O");
+ }
+ }
+ continue;
+ }
+
+ // If mom and dad are not the same, then translate the alleles to:
// A if it matches the maternal parent
// B if it matches the paternal parent
// H if it is heterozygous
// - if it is missing
if ($genotype_calls[0] == $maternal_genotype[0]) {
if ($genotype_calls[1] == $maternal_genotype[1]) {
- array_push($converted_line, "A");
+ if ($mom == "H") {
+ array_push($converted_line, "H");
+ }
+ else {
+ array_push($converted_line, "A");
+ }
}
else if ($genotype_calls[1] == $paternal_genotype[1]) {
array_push($converted_line, "H");
}
else {
- array_push($converted_line, "-");
+ array_push($converted_line, "O");
}
}
else if ($genotype_calls[0] == $paternal_genotype[0]) {
if ($genotype_calls[1] == $paternal_genotype[1]) {
- array_push($converted_line, "B");
+ if ($dad == "H") {
+ array_push($converted_line, "H");
+ }
+ else {
+ array_push($converted_line, "B");
+ }
}
else if ($genotype_calls[1] == $maternal_genotype[1]) {
array_push($converted_line, "H");
}
else {
- array_push($converted_line, "-");
+ array_push($converted_line, "O");
}
}
else {
- array_push($converted_line, "-");
+ array_push($converted_line, "O");
}
}
// And print it to the output file!
fputcsv($OUTFILE, $converted_line, "\t");
-
+
//delete inpute file to leave a clean track
}
unlink($input_file);
-
+
// its too verbose to print one line for each skipping site, myabe a summary
print $skipping_site_count["multiple_alternate_alleles"]." sites skipped due to multiple alternate alleles.\n";
print $skipping_site_count["missing_call_maternal"]." sites skipped due to missing call in maternal parent.\n"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
One of our researchers requested that ABH format support heterogeneity/heterozygosity in the parents/individuals. Since many of our varieties have built in heterogeneity this is not an unreasonable request. However, most researchers do not want this feature as many programs do not support this altered format.
This is what the format looks like:
Ideal solution: Add a filter criteria which only shows up for ABH format and allows heterozygosity to be included. This filter would be un-selected by default
The text was updated successfully, but these errors were encountered: