From 08e2c4b906f4cae673f88d42b4acf724295e0bb8 Mon Sep 17 00:00:00 2001 From: Philipp Bayer Date: Tue, 27 Mar 2018 14:49:13 +0800 Subject: [PATCH 1/5] First piece of work on the preparsing file recognition --- app/workers/preparsing.rb | 54 ++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/app/workers/preparsing.rb b/app/workers/preparsing.rb index 0f4e2d16..84b5c997 100644 --- a/app/workers/preparsing.rb +++ b/app/workers/preparsing.rb @@ -1,7 +1,14 @@ # frozen_string_literal: true require 'zip' +require 'zlib' +require 'rubygems/package' require 'digest' +class Nested + # a weird hack to get a nested method call working like I want it to + def myreader(zipfile) + + class Preparsing include Sidekiq::Worker # only retry 10 times - after that, the genotyping probably has already been deleted @@ -10,11 +17,41 @@ class Preparsing def perform(genotype_id) genotype = Genotype.find(genotype_id) - logger.info "Starting preparse" - biggest = '' - biggest_size = 0 - begin - Zip::File.open(genotype.genotype.path) do |zipfile| + logger.info "Starting preparse on #{genotype.genotype.path}" + # First, we need to find out which archive or flat text our uploaded file is! + # We use the bash tool file for that + # + # There are two possible outcomes - file is a collection of files (tar, tar.gz, zip) + # or file is a single file (ASCII, gz) + filetype = %x{file #{genotype.genotype.path}} + case filetype + when /ASCII text/ + logger.info "File is flat text" + reader = File.method("open") + is_collection = False + when /gzip compressed data, was/ + reader = Zlib::GzipReader.method("open") + logger.info "File is gz" + is_collection = False + when /gzip compressed data, last modified/ + reader = lambda { |zipfile| Gem::Package::TarReader.new(Zlib::GzipReader.open(zipfile)) } + is_collection = True + when /POSIX tar archive/ + logger.info "File is tar" + reader = Gem::Package::TarReader.method("new") + is_collection = True + when /Zip archive data/ + logger.info "File is zip" + reader = Zip::File.method("open") + is_collection = True + end + + + if is_collection + # Find the biggest file in the archive + biggest = '' + biggest_size = 0 + reader.call genotype.genotype.path do |zipfile| # find the biggest file, since that's going to be the genotyping zipfile.each do |entry| if entry.size > biggest_size @@ -27,14 +64,13 @@ def perform(genotype_id) system("mv #{Rails.root}/tmp/#{genotype.fs_filename}.csv #{Rails.root}/public/data/#{genotype.fs_filename}") logger.info "copied file" end - - rescue - logger.info "nothing to unzip, seems to be a text-file in the first place" + else + system("cp #{genotype.genotype.path} #{Rails.root}/public/data/#{genotype.fs_filename}") end # now that they are unzipped, check if they're actually proper files file_is_ok = false - fh = File.open(genotype.genotype.path) + fh = File.open "#{Rails.root}/public/data/#{genotype.fs_filename}" l = fh.readline() # some files, for some reason, start with the UTF-BOM-marker l = l.sub("\uFEFF","") From 276b593e2e5b7eaaa8454c7fb6a3c8b1b3956221 Mon Sep 17 00:00:00 2001 From: Philipp Bayer Date: Tue, 27 Mar 2018 14:50:56 +0800 Subject: [PATCH 2/5] Fixed python True/False with ruby true/false --- app/workers/preparsing.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/app/workers/preparsing.rb b/app/workers/preparsing.rb index 84b5c997..cea0218a 100644 --- a/app/workers/preparsing.rb +++ b/app/workers/preparsing.rb @@ -28,22 +28,22 @@ def perform(genotype_id) when /ASCII text/ logger.info "File is flat text" reader = File.method("open") - is_collection = False + is_collection = false when /gzip compressed data, was/ reader = Zlib::GzipReader.method("open") logger.info "File is gz" - is_collection = False + is_collection = false when /gzip compressed data, last modified/ reader = lambda { |zipfile| Gem::Package::TarReader.new(Zlib::GzipReader.open(zipfile)) } - is_collection = True + is_collection = true when /POSIX tar archive/ logger.info "File is tar" reader = Gem::Package::TarReader.method("new") - is_collection = True + is_collection = true when /Zip archive data/ logger.info "File is zip" reader = Zip::File.method("open") - is_collection = True + is_collection = true end From 9369ad02f7fd1c931458de1090d8eac86143a13d Mon Sep 17 00:00:00 2001 From: Philipp Bayer Date: Tue, 27 Mar 2018 15:00:41 +0800 Subject: [PATCH 3/5] Accidentally left part of hack in --- app/workers/preparsing.rb | 5 ----- 1 file changed, 5 deletions(-) diff --git a/app/workers/preparsing.rb b/app/workers/preparsing.rb index cea0218a..fc9dd527 100644 --- a/app/workers/preparsing.rb +++ b/app/workers/preparsing.rb @@ -4,11 +4,6 @@ require 'rubygems/package' require 'digest' -class Nested - # a weird hack to get a nested method call working like I want it to - def myreader(zipfile) - - class Preparsing include Sidekiq::Worker # only retry 10 times - after that, the genotyping probably has already been deleted From 610c56f42fc7b2535af1aa92adfa24c0b4ee7a02 Mon Sep 17 00:00:00 2001 From: Philipp Bayer Date: Tue, 27 Mar 2018 15:20:36 +0800 Subject: [PATCH 4/5] Satisfy hound --- app/workers/preparsing.rb | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/app/workers/preparsing.rb b/app/workers/preparsing.rb index fc9dd527..b07acd48 100644 --- a/app/workers/preparsing.rb +++ b/app/workers/preparsing.rb @@ -18,30 +18,29 @@ def perform(genotype_id) # # There are two possible outcomes - file is a collection of files (tar, tar.gz, zip) # or file is a single file (ASCII, gz) - filetype = %x{file #{genotype.genotype.path}} + filetype = `file #{genotype.genotype.path}` case filetype when /ASCII text/ - logger.info "File is flat text" - reader = File.method("open") + logger.info 'File is flat text' + reader = File.method('open') is_collection = false when /gzip compressed data, was/ - reader = Zlib::GzipReader.method("open") - logger.info "File is gz" + reader = Zlib::GzipReader.method('open') + logger.info 'file is gz' is_collection = false when /gzip compressed data, last modified/ - reader = lambda { |zipfile| Gem::Package::TarReader.new(Zlib::GzipReader.open(zipfile)) } + reader = -> (zipfile) { Gem::Package::TarReader.new(Zlib::GzipReader.open(zipfile)) } is_collection = true when /POSIX tar archive/ - logger.info "File is tar" - reader = Gem::Package::TarReader.method("new") + logger.info 'File is tar' + reader = Gem::Package::TarReader.method('new') is_collection = true when /Zip archive data/ - logger.info "File is zip" - reader = Zip::File.method("open") + logger.info 'File is zip' + reader = Zip::File.method('open') is_collection = true end - if is_collection # Find the biggest file in the archive biggest = '' @@ -55,17 +54,17 @@ def perform(genotype_id) end end - zipfile.extract(biggest,"#{Rails.root}/tmp/#{genotype.fs_filename}.csv") - system("mv #{Rails.root}/tmp/#{genotype.fs_filename}.csv #{Rails.root}/public/data/#{genotype.fs_filename}") - logger.info "copied file" + zipfile.extract(biggest, Rails.root.join('tmp', "#{genotype.fs_filename}.csv")) + system("mv #{Rails.root.join('tmp', "#{genotype.fs_filename}.csv")} #{Rails.root.join('public', 'data',genotype.fs_filename)}") + logger.info 'Copied file' end else - system("cp #{genotype.genotype.path} #{Rails.root}/public/data/#{genotype.fs_filename}") + system("cp #{genotype.genotype.path} #{Rails.root.join('public', 'data', genotype.fs_filename)}") end # now that they are unzipped, check if they're actually proper files file_is_ok = false - fh = File.open "#{Rails.root}/public/data/#{genotype.fs_filename}" + fh = File.open Rails.root.join('public', 'data', genotype.fs_filename) l = fh.readline() # some files, for some reason, start with the UTF-BOM-marker l = l.sub("\uFEFF","") From b5b38a0623595993c88cf937ed9ba18e9f357a7f Mon Sep 17 00:00:00 2001 From: Philipp Bayer Date: Tue, 27 Mar 2018 16:02:59 +0800 Subject: [PATCH 5/5] further hound satisfaction --- app/workers/preparsing.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/app/workers/preparsing.rb b/app/workers/preparsing.rb index b07acd48..dfbbfbe3 100644 --- a/app/workers/preparsing.rb +++ b/app/workers/preparsing.rb @@ -29,7 +29,7 @@ def perform(genotype_id) logger.info 'file is gz' is_collection = false when /gzip compressed data, last modified/ - reader = -> (zipfile) { Gem::Package::TarReader.new(Zlib::GzipReader.open(zipfile)) } + reader = ->(zipfile){ Gem::Package::TarReader.new(Zlib::GzipReader.open(zipfile)) } is_collection = true when /POSIX tar archive/ logger.info 'File is tar' @@ -55,11 +55,13 @@ def perform(genotype_id) end zipfile.extract(biggest, Rails.root.join('tmp', "#{genotype.fs_filename}.csv")) - system("mv #{Rails.root.join('tmp', "#{genotype.fs_filename}.csv")} #{Rails.root.join('public', 'data',genotype.fs_filename)}") + system("mv #{Rails.root.join('tmp', "#{genotype.fs_filename}.csv")} \ + #{Rails.root.join('public', 'data',genotype.fs_filename)}") logger.info 'Copied file' end else - system("cp #{genotype.genotype.path} #{Rails.root.join('public', 'data', genotype.fs_filename)}") + system("cp #{genotype.genotype.path} \ + #{Rails.root.join('public', 'data', genotype.fs_filename)}") end # now that they are unzipped, check if they're actually proper files