Skip to content

Commit

Permalink
psam and pvar files
Browse files Browse the repository at this point in the history
  • Loading branch information
biona001 committed Sep 7, 2022
1 parent 09e0b33 commit a667de0
Showing 1 changed file with 42 additions and 14 deletions.
56 changes: 42 additions & 14 deletions src/write.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,23 @@
write_PGEN(pgen_filename, x, sampleID, variantID)
Saves numeric matrix `x` into a PGEN formatted file. We assume `x`
stores dosage diploid genotype (i.e. `x[i, j] ∈ [0, 2]`). As such, the code
below implements a "fixed-width storage mode" which can write a standalone
PGEN file in a single sequential pass.
stores dosage diploid genotype (i.e. `x[i, j] ∈ [0, 2]`). Note all hard-call
genotype values in the resulting PGEN file are all stored as missing.
# Inputs
+ `pgen_filename`: Output filename of PGEN file
+ `x`: Numeric matrix, each row is a sample and each column is a SNPs. It is assumed
`x[i, j] ∈ [0, 2]` and missing values are represented either with
`x[i,j] === missing` or `x[i,j] === NaN`.
+ `sampleID`: Vector storing each sample's ID
+ `variantID`: Vector storing each variant's ID
+ `x`: Numeric matrix, each row is a sample and each column is a SNP. `x[i, j]`
is considered missing if `ismissing(x[i,j]) == true` or `isnan(x[i,j]) == true`.
+ `FID`: Vector storing each sample's family ID, defaults to 0
+ `IID`: Vector storing each sample's individual ID, defaults to integer between 1-n
+ `PAT`: Vector storing each sample's father ID, defaults to 0
+ `MAT`: Vector storing each sample's mother ID, defaults to 0
+ `SEX`: Vector storing each sample's sex, defaults to "NA"
+ `CHROM`: Vector storing each variant's chromosome number, defaults to 1
+ `ID`: Vector storing each variant's ID, detaults to snp1, snp2,... etc
+ `POS`: Vector storing each variant's position, defaults to 1, 2, ...
+ `ALT`: Vector storing each variant's alternate alleles, defaults to A
+ `REF`: Vector storing each variant's reference allele, defaults to T
# PGEN format structure
----Header-----
Expand All @@ -26,15 +32,25 @@ PGEN file in a single sequential pass.
"""
function write_PGEN(
pgen_filename::AbstractString,
x::AbstractMatrix,
# sampleID::AbstractVector,
# variantID::AbstractVector
x::AbstractMatrix;
# psam values
FID::AbstractVector = zeros(Int, size(x, 1)),
IID::AbstractVector = collect(1:size(x, 1)),
PAT::AbstractVector = zeros(Int, size(x, 1)),
MAT::AbstractVector = zeros(Int, size(x, 1)),
SEX::AbstractVector = ["NA" for i in 1:size(x, 1)],
# pvar values
CHROM::AbstractVector = ones(Int, size(x, 2)),
ID::AbstractVector = ["snp$i" for i in 1:size(x, 2)],
POS::AbstractVector = collect(1:size(x, 2)),
ALT::AbstractVector = ["A" for i in 1:size(x, 2)],
REF::AbstractVector = ["T" for i in 1:size(x, 2)],
)
n_samples, n_variants = size(x)
n_blocks = PGENFiles.ceil_int(n_variants, 2^16) #Int(ceil(n_variants / 2 ^ 16))
bytes_written = 0
# main PGEN file
open(pgen_filename, "w") do io
open(pgen_filename * ".pgen", "w") do io
#
# Construct header
#
Expand Down Expand Up @@ -100,7 +116,19 @@ function write_PGEN(
end
end
# handle psam file
# handle pvar file
open(pgen_filename * ".psam", "w") do io
println(io, "#FID\tIID\tPAT\tMAT\tSEX")
for i in 1:n_samples
println(io, FID[i], '\t', IID[i], '\t', PAT[i], '\t', MAT[i], '\t', SEX[i])
end
end
# handle pvar file (consider output a zs-compressed pvar file by default)
open(pgen_filename * ".pvar", "w") do io
println(io, "#CHROM\tID\tPOS\tALT\tREF")
for i in 1:n_variants
println(io, CHROM[i], '\t', ID[i], '\t', POS[i], '\t', ALT[i], '\t', REF[i])
end
end
return bytes_written
end

Expand All @@ -112,7 +140,7 @@ function write_variant_record(io, xj::AbstractVector) # xj is the jth column of
bytes_written += write(io, 0xff) # 11 11 11 11
end
leftover = N % 4
if N % 4 > 0
if leftover > 0
bytes_written += write(io, bitstring2byte("0"^(8 - 2leftover) * "1"^2leftover))
end
# track #4
Expand Down

0 comments on commit a667de0

Please sign in to comment.