From da1a29ae66a6e72c527d6bd4df835f22a163a98b Mon Sep 17 00:00:00 2001 From: agius Date: Tue, 4 Apr 2023 13:48:17 -0700 Subject: [PATCH] Fix xlsx parsing of empty cells with no formatting I found a spreadsheet in the wild with a bunch of empty-value cells. ``` 65 158 ``` Because there was no `format` specified for the cell, it was not an empty cell, and there was no value, the default behavior was to treat the cell as a `number` type and try to parse the value (which was converted to empty-string). Using the kernel-level method `Integer()` , this threw an exception. The exception was also surprisingly hard to trace; it didn't have a full stacktrace in the wild and didn't appear in `$!` in IRB. Apple's "Numbers" program converts the cell to zero; Google Sheets leaves it blank. I'm not sure what Excel itself does with this kind of XML. This sheet was [produced by KendoUI](https://www.telerik.com/kendo-angular-ui/components/excel-export/), some angular framework with tables, so it's probably not very common. This change adds a nil check to the typecast, and returns `0` or `0.0` on empty-string values. I'm not 100% sure this is the right behavior, and I'm open to alternatives. Maybe we could check in `SheetDoc` and return an `Empty` cell type instead? Or add an option for "throw exception" vs "default to zero?" But it feels like since Numbers and Sheets can open these files, Roo probably shouldn't throw an exception here. --- lib/roo/excelx/cell/number.rb | 18 +++++++++++++++--- spec/lib/roo/excelx_spec.rb | 9 +++++++++ test/excelx/cell/test_number.rb | 5 +++++ test/files/empty_numeric_cell.xlsx | Bin 0 -> 5155 bytes 4 files changed, 29 insertions(+), 3 deletions(-) create mode 100644 test/files/empty_numeric_cell.xlsx diff --git a/lib/roo/excelx/cell/number.rb b/lib/roo/excelx/cell/number.rb index 5cd9b552..d37fcbce 100644 --- a/lib/roo/excelx/cell/number.rb +++ b/lib/roo/excelx/cell/number.rb @@ -20,14 +20,26 @@ def create_numeric(number) return number if Excelx::ERROR_VALUES.include?(number) case @format when /%/ - Float(number) + cast_float(number) when /\.0/ - Float(number) + cast_float(number) else - (number.include?('.') || (/\A[-+]?\d+E[-+]?\d+\z/i =~ number)) ? Float(number) : Integer(number, 10) + (number.include?('.') || (/\A[-+]?\d+E[-+]?\d+\z/i =~ number)) ? cast_float(number) : cast_int(number, 10) end end + def cast_float(number) + return 0.0 if number == '' + + Float(number) + end + + def cast_int(number, base = 10) + return 0 if number == '' + + Integer(number, base) + end + def formatted_value return @cell_value if Excelx::ERROR_VALUES.include?(@cell_value) diff --git a/spec/lib/roo/excelx_spec.rb b/spec/lib/roo/excelx_spec.rb index 10b0caf5..fa69c0c4 100755 --- a/spec/lib/roo/excelx_spec.rb +++ b/spec/lib/roo/excelx_spec.rb @@ -616,6 +616,15 @@ end end + describe 'empty value cells with no format' do + let(:path) { 'test/files/empty_numeric_cell.xlsx' } + + it 'returns 0 for empty numeric cells' do + expect(subject.excelx_value(3, 2)).to eq('') + expect(subject.cell(3, 2)).to eq(0) + end + end + describe 'opening a file with a chart sheet' do let(:path) { 'test/files/chart_sheet.xlsx' } it 'should not raise' do diff --git a/test/excelx/cell/test_number.rb b/test/excelx/cell/test_number.rb index ddcffeb4..8919b5f1 100644 --- a/test/excelx/cell/test_number.rb +++ b/test/excelx/cell/test_number.rb @@ -15,6 +15,11 @@ def test_integer assert_kind_of(Integer, cell.value) end + def test_blank + cell = Roo::Excelx::Cell::Number.new '', nil, ['0'], nil, nil, nil + assert_kind_of(Integer, cell.value) + end + def test_scientific_notation cell = Roo::Excelx::Cell::Number.new '1.2E-3', nil, ['0'], nil, nil, nil assert_kind_of(Float, cell.value) diff --git a/test/files/empty_numeric_cell.xlsx b/test/files/empty_numeric_cell.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d5d9fdf94bf8e24af7d2ac4c85ead871effee114 GIT binary patch literal 5155 zcma)=c|4Tu_s55^W;b@(G6-45UfH)uc4f)VWFJBqOSU0f5j9H8c0z^T)jJ*LALQKId~T4HY031polR1K_9!+;Ir5&=AG|0BErQ z01)6J;I5n_%oPfAHPLZ*g1Q*-!X4~y>)-=`g?Fx4e)~Lt696!Z_dXae_U&agikKF=k`@cLT_tnLxu-=lOlfpE=a8e+RyR^ zDBs!5wmRS$))wg2bv0Jdhu~uLim!LQb-~p*;_%b8H;*2dTwCc~w;E^!Xq?-DOvg0l zGW332M#q!@y#prBPm>jLng%=^WtUCc0S{hr z@`-iF!(^qZo5{@)zLJV=^WwRTR|ztH4_MNgt~ZUYDOjSjck+D~;;JqTgeT&7&am zk2frngeDr1k@D&G>BUFS9VJnxuxYwvp&5j;2#0+_G*+RL7D`*Kv1y8mK02VQXnst; ztD6!NMXg@Pv@~#IsUQEtcqEdm<64GL@C8B=6%`nmG?O0@fh$~8fLKKyW5G4 z8f~#`qOyxi{D%5m7%$Zf2}j`w_@H?Dqg)*Z#Y7t>Cq?`Xf9^RgF~P_|X$@=aFS1G3 zSy&foXx9gxdH8kF@jSU%0AFihzlB7GRiT>oZqXU3K0Yq*Sl)E*{jgon#OR_@<+jV? zdBPNu6;mpeE9~PHwr^l~3oJ+?c-%!AWga>nl+#|k6lC^!h!zqbabYFb4;)8t=>5=& zuzWlMH{>mQ$N+`fkxVhZ)^c!^1{c8=!z$)FL9xR0<0qTQZ_ll*1-Y;s99HZix!B3&McTCxJXo{El?ow)Ec7AqU zeqw*2(Hs`{sA8r2{PffN!R>Of=_uO6DHlnUZl=0KL4BN(0``FPGHx^L#W4RZ3mi$G zCK#h{j?gT2J{AY97zq2tR$T>XEAysog%!XMn)<$0jajD}Ovq{SJ|GE+h$OVx=6tF# z9G%)KQhEV~m|+xsv@kQD&_N(8l1VD@?8S2bFFn& z7enGdjjc`>IhkXUW1Br;dfM@60eJz=C5x9oMfb=-vM zW8lB(^R15*<*urhqg9LxokFa?q|Xmc`!|WstJ~LK`%1)s$^~?Zdo>Sg!ff%!GH@O0i%RyP z=G?707(36p4R}~8YOy}>yf`T}npO`bRx=_(eVGd!#Mr3OV}}sNfSpdp8rq_*sN{`8 zUWV>~&c_4S+y;S-js-4S-u^BpOJ_4T8<>>~-`_X?meX&VYx1BGTIUFH{{np`zCC4Q zKm(GaFdkInC!S%0(MLV)ASM$>`{1|!Oc4o*JcC=m)}(o%`<05 zU(3VqSAKY>zhFgSOByfH^L2F!`|L^!FcCPneWE%NP$_(2PbT@|ff!2wmH7eo-o2T^ zJltYU35_F+4j+HPP|P5aIIikdPLz`Du{gfi3Ej?QmsVrXuu0b>MJEP>dfSy6IghTO zQg-7;BvjFD-LJ|T$l%x8RW0^)ZnBiCTjbScZu?T>siMB}OI2r7>RXzXT`Q{}Mu)W+ zZC8kXq?wDWyFJw9A8GncEMMugz$TbUkNr=8voG4~Li5?n7#3!8pY28#z31*IQN-xL z6nS;H!w^0>Ijm?)ktaFYm-MlL_MX!|zk!@_773l9LRFGK`6n#bF$S&!i|GlkG_d^o zHCS_ZsG7iF(1AAXtmV859~9g*Bdc&o6Xe<9znryT21K|nYtHaIwLSn6AfD9|*dT@N zbtzI37RKK8IZ44BYpQ2U4d8gGvx>gZ)EKI*#7|EqA-s3kJyo|Auf-icn{z@`0H4+l zD|XuShknSyqy~~Xxt6BaVdwV#l~nJHSkjr1DXpb2Z3g*RbB_tNAAv{jbGyuBj7)&7 z`zEZR}HT04<9_UrLPZT!$m38Ek86 zPzFZcRqdX@`c)%7uPVfzLA@0Ds=XD8p?-SSsts->TDMk+7)cGt{_O2r=`cMFvvUQ8 zI3FxP;bF2J{dq?T&&O`io*=gyFqdcZ*SdYZxRe5bfpv?V*=!QVWa2QA*YONH!R0Xx z@3?p%0>EK2tI6D^Ro1+WCt5d2wnCy`%BqlGRc&h!SP6M(8*K=-)#Dp!X}`D%nW;HS z?wOCAk>#;ir8L*n?!=_1(Fnpgeo8JHVj$~yxEjh|Y346CB$`t5p?3kEm#j&<-R}9Gj%Wj?r&jQ< z-2UgQ+=&2cJDR-JTSVouoA^|6zBYx79DaqMV<}OUmcR+KC>{Reyih3>wvWEq31h4G zXfNBnLiJA^RL{nJ=>ffvEt&>io{1tC~IqI~TOY>xc z7etdSD}o}_hS{%+gf*<@ONO%Yi68r>Dfq8GF*jbMG<-^bRq>*MjTDzqTx%WN=SlHl znfAueb8F`9?z^9l6YJA@7<3n&PCe)i-0^w__mtz$ToU(slSRG z_OcLN-Y~GN=iVvYQy=dx+yANP;%Hm6SI~XG1``0F`yt3bn*CWb|HpP;6fI+m9wQad zjy@PQQijyJ8S-59vgLR-F6q#yKq(uG>e#_*vbKTfw6 z?%YsvEDwSfbhLu z%`48SYCO^YChj5y{%z96R(d?$kv5Ds=Q9=$1zDcRRB-bB0=KyE=cf3V-Tq51z_= z8^0CQn;8mG+Bro!X+00wq;upn7CrTNT zAaRA7h-OyAElQnHa%!3}r8)SLo0ug#2{rj>ATCFGcJ>uUkKy}vMHLckF$#~}?sUkGDywJOloVv6MgyW+D%{O?N-zZPA0;j?xZ!d@#E1^ZG$ATh!eX8lK3`Xe zO$PQM90D(M#i$2Hk9$e1>iYtUQrajrR4_0ZfxkP1pg{)&06<3skm>jB?tdOhVE;Ih z_~|&}XYl#i`YhNV%|AD3|2N?Isr4+-9bEvw%&>ok{yKV{h2o=a&fn1gF^Hk*-v+Ti z#NfO$KTB7_`I-J3pZ@1%eDh}KE$`U}>EV8V&~FUzf1{swerM?~@xIf~`oRB&JZ~7z zB6}}