forked from lemieuxl/pyGenClean
-
Notifications
You must be signed in to change notification settings - Fork 0
/
configuration_example_1_of_2.ini
191 lines (148 loc) · 6.7 KB
/
configuration_example_1_of_2.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# This is the first part of example configuration files for performing efficient
# data clean up. All commented out parameters are those that are used by
# default.
[1]
# ##############################################################################
# Checks sample contamination using the bafRegress tool
# (http://genome.sph.umich.edu/wiki/BAFRegress). Field names can be modified
# using options (as described below).
# ##############################################################################
script = contamination
raw-dir = /PATH/TO/DIRECTORY/CONTAINING/INTENSITIES.txt
# colsample = Sample Name
# colmarker = SNP Name
# colbaf = B Allele Freq
# colab1 = Allele1 - AB
# colab2 = Allele2 - AB
# sge
# sge-walltime = WRITE WALLTIME ONLY IF REQUIRED
# sge-nodes = WRITE NB NODES AND NB PROCESSOR PER NODE ONLY IF REQUIRED
# sample-per-run-for-sge = 30
[2]
# ##############################################################################
# Checks missing rate and pairwise concordance of duplicated samples. Duplicated
# samples should have same family and individual identification numbers. The
# names can be modified directly in the transposed pedfile.
# ##############################################################################
script = duplicated_samples
# sample-completion-threshold = 0.9
# sample-concordance-threshold = 0.97
[3]
# ##############################################################################
# Checks missing rate and pairwise concordance of duplicated markers. Duplicated
# markers are found by looking at their chromosomal position. No modification of
# the transposed bedfile is required.
# ##############################################################################
script = duplicated_snps
# snp-completion-threshold = 0.9
# snp-concordance-threshold = 0.98
# frequency_difference = 0.05
[4]
# ##############################################################################
# Finds and removes markers which have a missing rate of 100% or markers (not
# located on mitochondrial chromosome) that have a heterozygosity rate of 0%.
# ##############################################################################
script = noCall_hetero_snps
[5]
# ##############################################################################
# Removes samples with a missing rate higher than a user-defined threshold. For
# this step, we recommend using a threshold of 10% missing rate, as samples with
# a missing rate higher than 2% will be removed in a later step.
# ##############################################################################
script = sample_missingness
# mind = 0.1
[6]
# ##############################################################################
# Removes markers with a missing rate higher than a user defined threshold. For
# this step, we recommend using a threshold of 2% missing rate.
# ##############################################################################
script = snp_missingness
# geno = 0.02
[7]
# ##############################################################################
# Removes samples with a missing rate higher than a user-defined threshold. For
# this step, we recommend using a threshold of 2% missing rate.
# ##############################################################################
script = sample_missingness
mind = 0.02
[8]
# ##############################################################################
# Using PLINK, finds samples with gender issues, according to heterozygosity
# rate on the X chromosome. If you want to produce a gender plot, you need to
# uncomment the "gender-plot" option and provide a file containing marker
# intensities on the X and Y chromosomes. If you want to produce a BAF and LRR
# plot, you need to uncomment the "lrr-baf" option and provide a directory
# containing the BAF and LRR values of each marker on the X and Y chromosomes
# (one file per sample).
# ##############################################################################
script = sex_check
# femaleF = 0.3
# maleF = 0.7
# nbChr23 = 50
# gender-plot
# sex-chr-intensities = /PATH/TO/FILE/CONTAINING/INTENSITIES_FILE.txt
# gender-plot-format = png
# lrr-baf
# lrr-baf-raw-dir = /PATH/TO/DIRECTORY/CONTAINING/BAF_LRR_FILES.txt
# lrr-baf-format = png
# lrr-baf-dpi = 300
[9]
# ##############################################################################
# Using PLINK, performs a plate bias analysis, using a p value threshold of
# 1.0e-7.
# ##############################################################################
script = plate_bias
loop-assoc = /PATH/TO/FILE/CONTAINING/PLATE_INFORMATION.txt
# pfilter = 1.0e-07
[10]
# ##############################################################################
# Checks for related individuals and randomly keeps one of each related group.
# If you have a server with a DRMAA-compliant distributed resource management
# system, you can uncomment the "sge" and the "line-per-file-for-sge" options
# to run this step in parallel.
# ##############################################################################
script = find_related_samples
# min-nb-snp = 10000
# indep-pairwise = 50 5 0.1
# maf = 0.05
# ibs2-ratio = 0.8
# sge
# line-per-file-for-sge = 100
# sge-walltime = WRITE WALLTIME ONLY IF REQUIRED
# sge-nodes = WRITE NB NODES AND NB PROCESSOR PER NODE ONLY IF REQUIRED
[11]
# ##############################################################################
# Using PLINK, computes the MDS value of each sample, and using three reference
# populations (CEU, YRI and JPT-CHB), finds outliers of one of those three
# reference populations. You might want to skip the reference populations using
# the "skip-ref-pops" option. You might need to change the "multiplier" option
# to be more or less stringent, according to your dataset. If you have a server
# with a DRMAA-compliant distributed resource management system, you can
# uncomment the "sge" and the "line-per-file-for-sge" options, to run this step
# in parallel.
# ##############################################################################
script = check_ethnicity
ceu-bfile = /PATH/TO/PLINK/BINARY/FILE/FOR/CEU_population
yri-bfile = /PATH/TO/PLINK/BINARY/FILE/FOR/YRI_population
jpt-chb-bfile = /PATH/TO/PLINK/BINARY/FILE/FOR/JPT-CHB_population
# skip-ref-pops
# min-nb-snp = 8000
# indep-pairwise = 50 5 0.1
# maf = 0.05
# sge
# line-per-file-for-sge = 100
# nb-components = 10
# outliers-of = CEU
# multiplier = 1.9
# xaxis = C1
# yaxis = C2
# format = png
# title = "C2 in function of C1 - MDS"
# xlabel = C1
# ylabel = C2
# create-scree-plot
# scree-plot-title = "TITLE OF THE PLOT"
# sge-walltime = WRITE WALLTIME ONLY IF REQUIRED
# sge-nodes = WRITE NB NODES AND NB PROCESSOR PER NODE ONLY IF REQUIRED
# ibs-sge-walltime = WRITE WALLTIME ONLY IF REQUIRED
# ibs-sge-nodes = WRITE NB NODES AND NB PROCESSOR PER NODE ONLY IF REQUIRED