From 829a86dd6df535b99a6e58d3dc3f0cc1d7c10cce Mon Sep 17 00:00:00 2001 From: sunveil Date: Wed, 20 Nov 2024 15:29:49 +0800 Subject: [PATCH 1/8] Add tabby cell bbox information --- .../tabbypdf/jars/ispras_tbl_extr.jar | Bin 164506 -> 164553 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar index 2b3c916fa11d5468b0e1ad78ec2cd0d938c7d843..8a737c79cc911778c955682ca553cd68e68059a0 100644 GIT binary patch delta 10750 zcmZ8{bzD?U6gGQFLAtwKS;ta~u6J|^*ujx?D;xd#- zwco}9o?^f{O?CGT;)fpReJhaEsq1pX_b)3bb`a`?>-46PJ1Q%4Qp-sPDI6eCKUq=` z@`}~XeOhoJ(v?6g`ec_ZM$h;#&2R9Ftx#*O23FARAd^0uecoxG9Sqw__tZhxjvzS&WSHFOpC5SBynUCTOXvbF7m{7H>Xt1}794i%o96FVlaxr@ zt13m}J4b_K&AB|dAY4~CR$ZpaFYp8bD(adh z-Y4}Pwqo{08)$xXwTn61G~-`*VpBjRye+<2T&qWvQ((s)A;bE`*H=!=z4D7i*ma=# z&|F&eJKZ|5;#ENWE&xhuIDzAjX^1yS-1V6HcY2?|I!!azuenu@rW2aLExAU#T4d)@ zUUc7S+Fl2wl2r0U0+KMZa-W-G{=(rKDelY?*eb)|rsjGhtDb9-9dumPJ2T~LcY$vC z&zfVq8*6%QoFxKkupkysDlJNK!O?NYj)QYDlJp0To-)5_qZ$)&Qz-q6=Bl|kBIeRy<#PhQ|)$|lIWL=YP;Wf3bCZ_5{Q#*|~pRP35 z9+zn-*>UI(x-we}tF(DAsc$Rp&#ZEn!00?y+>B|GP2GJCOsN}|3~%OwBlR~m(|o7; zzV3!f2cBGrTm%R;a*=qt*RCxkiXEi9JiT0kitf#e9kjD(67~!)6II1Ov7=h^YM>~K zcjBzf<7yOsULke>zh>Lg+?a`VD*h5^%n@}6p~hXIuTLmay>tz&$c+9PJZ-@E1g7P( ztIkcFN;7$!UP`n)#4lcW$B=uI({_~@$_KA9Z#z|T5r2lA%HR$aa^wO^U2#p zv||aUfjAb9NkmEvSN$uy=p@o^;xX%8GSV*Y4U_3ar-mf(RWDD{Gk*-1iZCIxW6TER zctHZMyrikM9UPH6tQCJL6AcI17FZw65-;Z~+Le8z1;laXmcaynW0(pfDg1$M1S1;!UQDnQ#L-^r)Hg;s?5T zNU2c$Fm-ctkZnznR}He#`;!W!#I{pB#a(QDU82HQ*UfpeQXTCanhNFiFtqW((`tSC zgR~LTE4RnGY=V)FDZhdLi zWkga6f2!rBn%8R$r6{lDOSS}lxHs|}RPg!4XqI5=&S-@@-3)fu>J9MAm5xGL4QALn zMZO06GQJ5XvzzW%7GRKJ^e-cr#|$0a?_}vFcw~6)SJ_hOCmUT#adRo}v?Gw&<3*@F zZudH;tIodwF23ROsZ6j3Mh^E~DDBg=_Y-O^kLnf>^{|x{6if8a8rgGf&;B%iN!5c% zp!KEQ&n1|U)L3_m&_8o+>`4(SzhcQp(Z`#E0t<+Ds*U1Z&66whF2^2qU6q!6JYP9; zV?QD3k8|d56X9R&1Q164qAEoDyJ`%5j(sPaV`Q4Nh5Wo_C#Z@(Lg(>Yr#0MQw$5YSIM{LlWz0uZ991*J`+C^M>J) z;0b-mCXL6q!Qby_{qw^!m5~H@pul~>Qh#XiMG?;`pS7@<(^F_9l?ZD*@yesB69w#9 zob2weK|;IXIA~8DF?wonhiwLfCVVOdm%_{=VULWvOd5#b-l)aa*v0J0>Z`ZZ9Yf1# zO>f;f+ht5Y+>XBK%t?lCvox7pZLswE%9_i&Zj+qP5&tML+ZH+>7wg$gsJ@!wcYLWz zyGWmc;zb{x2(RbDU%12eVg7p{9}?r~=-8f_SR z?E!KcqZl(U5ES-B47C+3BZvZ)wcMFrjHo<`+4&wh`uJV35k4y!Sz0*PvX7$q~xucxN_tfb@CB;k9cX3dMOaNb%K%P zIUPr}!Tgr0s4gkdrQAC487O6G%w%4*;B>&^YYNisbW)16(=dOE#^m4>{8B9U#S_eTUDeaQJyS?hUK_Mcb~MN2O!l%Gc86Y)DzVBjE77to;47)n<4W>M z$T={g2y&|Z32~e?6rL@Gfo{IbPnxwa)(T&!PMM!KVa+ol zNMElgxV@fZ0~6CQnOBY&0(#F_1^V|SvJVwBez(TtQ=&$0N^t2L=}Ngv z@4@wR+_zL=mc!MxKaZNA%`xNkh$FG!P@UwK?rZY}?!B}I{3eB(vOW(0bd9k~ji zJEy*oPXZy>u4i#7J}=nHMHw^7_rj3%$7X3rVU9s-$5CuqXI|i;qQ*EzykNxSh1T9?A`@?)JsI2$6zy8hSzwPS1P)EVKu; ziBsT)l&sJnO1PcYFWY!hBnMS>eOSmkPQSo$yYOg-FWelZT*ykShGUL|Sata>7#+1- z&{-bcQHi(YttcKIR*#2bG>6gKhlf|s3X}0%DcK}fq4C@dcUvy{wX9?%8iiwdMSJ3( z-=wYhFY>i4n?=DI!lqW-7rBCZF9a5T?{|cL6ZUSqpvSq6NW2KoTUP#7b!7LQmiChI zG8RTBb%xum^{4arE$ndw%PF=Namm;!QGs~y7cNJwT$RA%6oO)OR=L1&$!yBqx>_iq zp8T&u1B7F)%k`4)HM574R8GM}WO8t-mJ!0QW%v_tev1N|k-K0>mi&@Xf3o0n9?SJr zI`q)6DVdpaZ-8_wVqNnYyz= z<2C36`xYCl`p0--j)K;y>eRQ!@@`QsZDK}@ymq;T0>BQ8KOj4~~TAO3yh zCz8P|imnfi&HI_!Bdwq<4reQQ@yH5RDUZH3knBh>Oxf$iDnKumV)QUF+R|{OqV+6} z;xg8fdG)W=ItyBa0Q3@NcFVn|Mg9;WO9n&JRzt_IB#Om8$45dsWrX8DqN>#m%%Xf~ zA3*>@I5M2@p{GO+cjtK!VmolCAB4Ci5u|xl! z;KA_lu*YQq&w`}yLF-@9i|oI-p^lw_QbFgz45rL9m@sX8VvaSPJ~b)Xb@;KjQrMAoIc zvfYUqA0jek`H3@&bjgQEh(3?jJ^k>SrPvPrU1UERzslpW&QBYq`t52h(JQgXOs*Sc z`pkj|6?GVxNs+oy-6Kqr7>%Pc38cE16fqU^a&h{Te$?-6nywDA)s0t5+F51HKyA$8;z9V%#GVmr#%XK}WJ&F_dt5`?;AGyS%>PA63m*SxfEbIt22y6^j1& zA@vOxOS~&dOukV15a~kU-kZ;pWFA$Bt`SORt6N0YUG1(O$YdMjCS7cLDpjT%xp_r! zZGZdTQWZ^u!%w?N{*|*~qe&YY!|UA9gdhOn;#Tt&|I3+VkzY!s-d0Ge$9qa0E}s(X zUKpXNkBYS0kGZ{NVp_dz$7_*aWbmChqWXFFD5)3x&Pp)!;@{rhOI-_PWA1+9*_}zk z$D&TpWqTDylLP;-r5?N&w_%Ny=iYxQ)VsxY_x zW3;Zfy|4MBzLkyAD0IFaK_y|?mKs96<2bC{gDxDtm&AI;MdZOR zTN744(9|DxS+Ba7f4Cwh_vTE2z!KIL6_tjyO;*g)HXK34F5N0u#eNo^dKY(R`ldgZ zdD+y>ifK?7@A5seL)y?`#f(1glS6d&KW|L>$gzDyu|^lil8LUxQ}1%`Aeu9({8X5` zVKF!^2xxc^ZC=;=KaIxxZ<1k!Fx{;@gOHrNC?1Mt6K&dh6fWOSxE@Z2R&QZLEzP*2 zlxLn)Cx>WT4j~DQG0Axd80%S{`un^OKb)T4td85w2?QZH1eVz`+^})>=V!!ypk5dr z4A#$% zi?`D7r=iUZM+)=oMe8?*4x%4nTu!O|$UrphQK7V^+n*z$2zWW8`lB5n1)0^&?mwXE-MwA;hoKNRMGK|W%qJ89q zm7Uy#zl=h>I32JZf(4p=P2pz!bFgYH$}8_ z4SEXEe`qI-MF|tnNw1kM?~-TqNFtmE#G4J=J~vGi3m&p(_>p6x;|9GIwoOVCMfaxA zb%lo=-g@ZrQdN<^`SbpL(wtzF2n%R8@^O8iTb6xL$1=yKQOYx>HVLvA6+=W~o)Rd# zW1gBxo#6jFw0H1uTGYQi0T!8^#yP6^jHc=S-<%J%7msYk0ooi(3EzZM>tSPdl4ORd z#pknQ1(qqz2sSzQsPy8kXc5KG!N-4wHPMdO?Q(vbGKTi8`K8Q0T1pXpVy$6UFT2pt zW+B9HHgr)>GW)s#O=eQmbH037F=Q_%Zs21bsuL!=?Nr=j{ThY?%{W*O%aF16i&qyB z9;Ui{mtemV+_hvk*Y9e<=Cx&jEy=Y0kF9yVWVKUYF)Vi>BW~pckoc^}u!Q|<4$2oo zx;A&a9h+i7wu+;Hm%WGYA}LOqwp*@6rod?{`}OX%lWQJ5ONVc#O+*OQDZyzYFVn-C zO(mlbm7pTI(2mzvc30Ta_RPPAc=Xx}h_#(~5vr3e0 zM%xN=F?N4;yzVJisIGrv)Hx6*$I(dtgM9U^CVYg%(JsF~XP)<1v1RDaOwm(rs*Q`0 z@sw+auc(f$+4phk4>hBD3|~r%MmA5~&g`eX4pb)Z4%K44CG)djDo~-Pvad@L?d{DQ zXzir(4d152Gi=7MwVOk(=9@+GqmLF0mV6Y}KFqBBYBbD~^8d|YKjUlRKWp)3U>fso zYfz@>lW} zn%)w-Pd=*bS>HZikYc^SK>reMiZLh>xeFV+JCkgd{>d=D$Ytu5jVwWyOq-rtBM$u| zlj=!-k^g=1J+$U%rg#n^#YHhs!<(M#@|0wpFkFW#!`>rx1{UG1+f|}hH84zxTL=c` zG8~|zO6f}w9h0{#ljLH0WWk~MmZtGBrI9XMq-RW)h&wDT#j{2IVd+Cz?oF%BwF}E8 z(+CjZHh5E%eL<4TK{aBKv>BYVX*zp48&v5l!Q(B#<0~O7nrlWql2gBFwY!dfxXJ1@K$_IN{l-A+=yu1~_%q z6|7?ktcuQjZf?2Y{M|Y_=aY#>B4PWH8(xN50{wjT80Nlfc=YwdtW|~n4ec%Gkh7Gc zxYoF`<{0o(;w)yuoI?AOJ`z9(lW?k3*(@R5!jZeyfaA%|Zx)dB7EwBQ zoKb@3WnqVi)DHO%az#0|-1{VTe#Cw#VbV?scX`97$-ZyWx8+m^e>1SPCXZ`#7U9wb z7y3cofz0X=@4`M$u)ShsKs`5A(eA?(VJ)XL@CH zX^ncDuM>89rI(dgtjM+LY=yr+sK7zjTM6krSA~siX8P*o!d-Hi^HofYGaK~Wh?&Vw zn`g>K>^M-&R=+n?JZG(QtCJt`ZE2~h`thMtDNL6<4NV=d%ApPAh323a>c_xHDpm|W zy+J-L*2Z_!zfJeVT=tuO)bEM}NUnc#)nU0j!_wj2Xrv3+lvOAAi*uM{OwZ1|Vm>wK zx7i09q^F}9hG*nv51zgEDdN+Ms&+9lvj}#xKsTG(tgaL5{pq6Lol-%OFyyOY4f4Mb zUe{O(d(P>Fx*ZTnHGrjy^(Ly4&NYXwVIB1=MicE=j^WBa-AgSInFB8fegV!T2Pilh zbCZJQdZ09S*w>kIbXiUW^CO2`hz+g zcVOtbU|Y%TSm?K(GpAv$BO+{1O3S12h7nItbMjZocQSG%U%g&KS)!?rvWb|g?_b${MFBKI`%g+8(C^6plS)n+Y*sfv_8QB!|CXF{Z>GF?NmCB||@-yQWwsl(@e|@7~cT=>7=E`R& zCNJ%1t4Z2&G0_@U;qB6;{`hR5i$FC;m730%uuPC%`RgYW^SctI-0#U!!ztcp_UPvZ zIJutBhZDBN;-A<48mwKS!0>L0$%e&-pq2+u_H>#+X*{~>oI}E@4f2r9k%keXvYIU4 z9py3X4X!Qe+D%N#58}{_C-*_<&wK?c;nL5Wh*IilWowr&lblt(!n+>s#+oE|C z3tE?^lZ$Im<`{yRY3d?I$m~l-5Wd`z*;gEhwMiDM>K2&aH}B4neIG( zG6m1o+4OAGrrPkJoZJ-_P*Ac?7mSE?R|#BFu9#1=)=(z7(ik!`)Fyp*C^5-OB&8K` zwtT2pA=gXi*P^C)NY9?ZVLg^Qo;v)-lQSr}Ym8p{#Ua7gNUF{%P<&`E{1> z*(!s=L1IOcguV|AyF+R8Ot}oLi`u$PV(<9+&{~JlYHhJVt^GPo?&I+UD<3((nTX=M zs&6aYhkaO81|eFbV?V)#|CZ_ARRYOE=iY8bd zT%=~scnuSu8xxAV*8Q;9ulGwg17mEO&?he=CKi=-f@zf}C71&cP^wXVJE17#_A7no zbUB8k;YqcZU$h)yY5V~~Inz8@8K|{+M0zV9>T#y1)YgMUuNF(ze^kqPrbF+Zlh6;P zN1RCra-5xn@=ezr>c(@~?~3urUqsLVf53nJhx`Y5z&!pF0TC z1H|+fqG7z+X;)B!HdvAS*ZnyZ~0lz03(lL;@J#UwZ%t{OSx)0JV+xs2mvT0^os9p!Yyj z^F2TUnm_?T$V)I83dqvl&ufD8e|@O_FBu8x6L<&(oskj@LRzOTp&FTesozW(P!3SM{tia^25Kjj}g z40y#2KIra!&n4~dAq!aP4X7a}Lm(i;^L{n&AQ0;RrjqgjD3Hm~An@ZWV$jP6P)7cU z`JXc41E?dA z1Ee8tV6q<|1)xiUBmYzt5Z51&fh>U9{s8?$)cua7g!~5I_yfYo)QS)gOA*2d?)w9e zAFL1i5&0QN9&m5wd6H9RgOXL)gHNf&Vk}Kemu^a3T=kM;_IHfZsGABp_W7z<$4x)Pevd zWIhWBNM`{d20#7-bHRxqKpJ5728n|KRmc=*9Sq1qOu&Ns0T~E`fJdPa+W$5Yamf8w zhZ6zvh5!tl_YGW=|8o)`A>sW$n_K-k1j*gg-ocGW+s4TO#D@YTVCuhc@^@f$2q1$^ z2mfCV`ZiAg*@;3QR>~6!Xh2%Q@K8Vjk^y#v-m|d;QNr%oyvv7x&-3qzTZG*cj{;-D zfG4#7c>JG}1_|l^eaN1I!(sOi*$qe*4lq0vUNZc?JfWaTI3Nls0^#BJvowRh!T~ve z=meyS0OTMVpj8B*0+8N;A0q(9hoF%NfRc*#F9bG)Ol{2^>;^@5uAK z@Av-BA7}2lXI9LbHFNE~u00Thk`{!5rXr8{2p8eWlP3tDlg+BoP9TWzwgv()0S|!f z(Td?;kbr4QlaPdunef__3hJS*k$k7jS|WE+tw9$+fGLUpS}Tx&;b~MgZznBffW#{_ z_7ZBwV)dR3zjTf~im(>9@<^Q5q$@BWXoxvj;j^@PX0)lL8Rq43bnA*%QvHglQWJWH zwUSTNX}nK)aQh#>qb0|0{vOW#2#flvJ8Y&RS|($Rsr*ppBTdzyHgAE>l^l1XF=cLX zLXcF{+<_gJWo_EHLgGbIj+-F?ZJM>%&@} zHZh!TAA{ovC4bT4Jay~;h?BG?{lb5|_?J;+a226+|M8)!xPdP8n~jVMoi2^@Cytrg zSRM_dnCDDEUCA28ffz;BDWP0XbS|hvVjE^j$b^{Oc$r$cQwA(rqxh7&-cyWDIPex? z_GYKtMOKfp%~VzW@fhe+Q=01*QO6lQfT~$j6O|lr<{gywqvSE0vB^j&I3EHc1GrP_ zovc*Zg=?f4L&e%qlMlYK*L)Vak1nuJXYuM?>P}_WjDkt|&U8&Cml?xHXI>z$(Bo>& zWn?K3TeA{m-?8SV&a1M|P;JR+HLK11B5ueg`jdOgM5L1OQp>b+Jxeod z#JejOb=8LjmMhjx&0+@4h=n|3rXhXKa$BfndNHz*DtO!iZ7i1UM$+m^8gzS8+N&TH zADn8Kk|;RviBi)<2lf-96^!cJDAV0z@%&MYxlv35Ny&{&Ly6ok)p;We3o+)mVU5(6 zN)khsk*>Ti9fr63YOz`~KP7KksAId(8A(D@FZi5QgSiMeT&^%@V_(tF;&O0k2ykJ& z!!oUN8jc>-hQ8tD@o41~`&t}NtZMig*^RJ_y0v8bMfxYRmoZm`Vd;!r#eT!P4T2iR zb&IanPv~xKEuR^(vYoGSqJbOX_TDx81n*wuejN{WtvU_?FWJiaLM&YJDBf0`pHxKS zGjbT|<*n(IZ@i4dQpdrNZq^vKrZt?E@35mBK9W96!+|Q%i`u>CE)NSHaT9R=5S;E9 z;cAzFIo(vVxftbiRCqU*c}_Ok7KLUPBeC#9QNSZ2_~quRo`{ECuooEIQk7)XH6{7% z6sp*+ zeTGTE&TI|c+)O>xKhgL!K|@TAmn7cm=?A8U2h$LZNL-l^lY@(w(ub|*8?qM;)I`bs zN8i+lvy%J0=&FrGvrKDU_|n5hx3%5}F_O0M2(eeh0=h%oq8Bg5ei#UO^RzMu39P3` z6vguPn@3QvI}5zfrb#9L{;O1PNn`=)*V;>8Rxu2X;hRHT3`Y8ezkIPqhISlo5ajz| zDC8UU32$eg66wURtS6)$z&>>$)yesW^rO%e4DLTgv0xFOlAdU43-JMqz4M__eH^EzYx8f zc;Rn@Qi?SAws<^WXMWL-cT#M%NQe57R(Y&L!`_Er;bf5?b=zinNx~8$JT<>mk2u;L zd^8WtZbfIsYg4z_oa|R_!hig&`f*_yCr1dGBqH|&!xQXbRcw-(NkL0K zaB%U4s_5*bth({72+3wmk20N47FGgJ|0_gk+M-d-VW&*bbX_J&>gk(7W#6}bfACOo ztW~R3D#k5Jzfg8_H+gcg#<~)!&39FCk4kUwl;SR-+?4Bvj6{AjEfmT_Do)G&vE|baOPn>=au2g0T|cx%ilor zaF)~1E%kj|#I|L-JKji$vTiwJ4_N4O+kInsriP%0VGuHl444Ad}Cr) zepLBt0&J(DiiB}Gr^C-@IDM1stlC`A1_bfr35&W79?Ks;HOyCw>=*>X;o+b7v=kW28dKg;Uf$qZIoQIVtR8C<; z(}VbU)cVD>q2I(pI#AQ1OOshTa~0Z>73-`s@@E*unT%PsoZwx;L8@kx-4oNg6&FMS z^b{(~fw=e+m`OpDWUUsnq_1SZ(g3b^(NN?-ktB9(NyHQhWvmgV0;9CE4u`S8JTbqd z=eK#QD4{Oq{#{+qE6Qve4UN)%D3|W@$=#GW(EoFrFeE@Ydv~dQ0L%A> z+|lyy*g|pw+yG_D-x!RN^RnCq^);JBVINaD6Qt-H6c58*fQ_IB$8eh9@)K=F#JWuMFhp(v~FVB4?1 zAEQwEYMJ+5S@dA&W^X&1fMYVWhj`A#EkY9~MmnC$M2e}P>)7Lmv^}Zi_eImP|80eLdpq?4^VvK?*>si}s*@2A;aJF7*rm#4qEKTS8A+?j#uIPRG+$d5*S%8}Q0u=_ zeI_NprCs+&w-wFgy6+O&M)haJ`^l*qKW$Xgxd~J6p`*!T z?`p3R$)LxV3i7#7pZ>V@@UT z7SJ<#w1m>nvjV9Uh{CKcI&ljXUZa_OhU~0UEA%-@EinyunWjWR&r|Y*IKh*b@e$oT zF02b$kw&#j*SS=$UlJ63cXeqy9f_*n`O}zYmves5G!!e%H=%e?gr+c?VtlePoT!WI z(<-fU(0vW#QF0^;9t$yjxADGtx=Ymdebaf0J;buZ#$z7X^sup*n!l@PfR+i3cD&^E zpljLt$z5ShOrkdi?M|O)ObSW&^k{ij!W0tXe$>so-r%Itd*s_V)}ypil1yH4y7$$0 zPO|uP-;eo4#bct9{)6(ty1HT%{v!16h@JG|O3aA~!b1z^i`;%+^wbS+xH!G-@`Sy` z%q-384a|O*6U7+(@L=~j((kW(p1@-^ulH$w_;=7vcs_zMI<(pL=RWVbqVgQpS@-I1 z%Re43#asWVNyaL+#L}Owt}I~tB^Q9Z`8)Y15%p0CXL6@UlF{DP4)koDp2MgCOgw8E_-7I&Cp7%YB+`=_ql$4dEPXjAlX!2bQI zHgR^*Nzm~9N){AqI)<@umNz3L@gA$+_UV(jzUP~Z<>d_PTDXObv4wx4c-N59RW>na z7Cz>@p0mLbM_)6P{YtEz_EkWjzjjvsR6dYt)oN9Hbd)%Upk!0{##tEild-Qq?J>V_ zPN+0F1<9ugM{(kWLNVX*nWLPj$L6^sk8v^+(Vi2hp*++W(Eyz$c6g|2+Y}qrzQ2F8I^d?`+2Z594>r-i-@orxoc+*T5$N2@tGAz_RR~t4 z!w&etsv_6{6yDT4blU&^J7&mF$uw*TrC3&iiHXnZ*I5vwox=>uL2b#Kb$wwqm`dpA zL*4q$vFg*?crmM>5T0tMtTepc z0{lX|I+`3SEty%EPcGLs%G7zhbW5}n7IbcUKW!o?1eN8{Pjciy_ZN={ev*@Xh`2Gt zOy?xD`5DxS$%^%nrDjkjlr4GQ$$W;XCRE=TS7)K7vzdvc?8K(nO(y}9&eW_lbyl;k z=!1rSp10(5)tI0==`F-bZDd1tM+o=Sw<0gv{>tm?+RhnSgTH;<83!`z^~z(y#T;mD zE*4+T({u+&z46nm9#zr~&ej;z6Zx-CMR)vGnU4UcxdVWZ`xz@Y`Dbo4th5MTzxxI? z!Z+bYbfQsjPeyR_n7@l%(kR7HiFtpQwWpG$=*W-xF4^8`b%SE9*R<=lr?p?&QBYCR zIIIyeq))Jc@)mg8T+yphf!6rL6}bZHC>(07uWXoU%~#=k*m+*@p|SHEeWnF{O=>oz zao`+@w=sIv>=I#dea5luH@A(8a@TWrGTs&v$MhDWh8B~#^`kDbBj#&O-I$iem<_v2 zI~&*baN8sIs-;`4jeY%HM$qZFyoVduR1=Wq~Kbt?9q&-I141M8e$L z*w;gr8yxz7tF`Xj;w@j^p^KhE@32HK{K7Dt^sd5S&lfEtKlL%ZN^fFiH6BANSZM5i zGI5LhX~7cJ#3*6kruoBL#JQiP`7fJji2TR)2rxyWx~9ToJQDl#hlE;t^L#E~F3k}~ zQf%Jud2F}ZOb-y{PYo|-SZkx`_77TiEMA_Wxf~Ztx?r3IEgnyBH-lpgP`7sJX*q+Z zzs<=HSEp4kRC$*AY6A|6j3V*SJ}kyeO2QC?Lvvapvhp%p`Br@C7GRh$D2w~^@gaLy zxXrq|u|Hc*^`!M&VZ~aTcKPfUAiOCY^BD88s9@KNg7FIn=v@@aDx*+%U!vSVgnN8n zTq-J=57F!`Uy*GJD=0&`vQdMU*>{PO*MpC4BeS;Rkc+%TjoN0MXLpW(8mck7enc0#wegC=h}j6QluW6Xr@-lcj#m!v;%xg70!Up>YA zQ-5$l9JPfsFNlcPKBP&Z_yu2PKp@b_YOGpxt+R6VhPZ;fHgESM9pXFV{@Gs_tBLIg zg~ro*D^K|frqwLj+ltd%Ki&Sh7x9WR7Ypv{o0F~N+CuY+SPr-|SCL14A{s)2(}#tC zkWPbu@Sjpq0?AYCngI87l83?&1i(q)Ou%`l7Re;ovOj>vE_AX7Ft8$_@E0^-vOtar z9}pl3L4EWOCkrY32Q?`|GCBWppte7R4yrfpm_>7VG+j|)tDVd;P#>8P#bmmPHQzX9P_? z(tKVzX>{|T%ygLNk6Jagx~bp-6;~N}rGt&S zx2}Jt&a}b0{GsYfB2eH=qck={h96oguaqQ*nzBI>^DXcX1S5qw0fp=icWitkzccVrmQwVoBTB@9qtcWixQaQBP`p)7ztP`{$6y5Ss|>r zYD+~Wo42dV0?QU(eVhbU&P`DzLdryHq|pE;GfV=gjq?(y)_s)3_bk64ir6vZ^HAiu zDU)r1pO_?0GH!GQ5iEbPf*RT^{$0HHl_I}jE=4tC7ZDDREsKRp0Tpp@`X;v6CsBl> zB&MkE4BtXrOr$-!QW6>Vw@S*?i8mNU3+sQbea~Mj%B+>oU1HK#oPCm(-D|MqAoNV0 zTZG%~nG42w^o5Ik_W%>};#*l6gR%+oWOY(`cCQBo7r6YU@fEqq^K6{Z;5hm!q?`s zj-Gt`btS@5H{+{bz+ZXRb6zqBX-j#Q_GZ^#4*BUai}R#E;zu;$3Zg3h*6HsSTzM<& z?s$_}%?7z7>dkR>;`RLo&_Enzt1znDSSpj6bax+x0^Qw>S0?J!L-EyzX3Vgsi(-?% z7plrb5)hFuc$4gkUK)}rwWb~mpb|Q;Jo7!9$?qgW$-1{#>)wGaEYd@@Jn zLsvt`zHf|5uDQogFKYkdleJ_0J%+khVnE5OW&f|UzigRkuKN;V%3uOnFe4{#n>1}` z{saX*he#K{**1Z?V4TF|c8bxr!auM_x2fXq)c0h)P%dQ!)n8iF2E*pm_j&Wl^4c0I z*s&)a@H_d6V;X$43!s(1yc`>GpJ%A=jqMT5;YTVeT03Uca?%_L$J5>P-mNTwxjuG(6RYOLpJuo{ z#N?a6QSuQ-x3Dm(8^sqjY~fj~LiT6QN>1{xpC{aRTNtDr;>n@P9VjVZ<~XUgy44{z z$2|q7lVz|p@s+oGmAjQlB{teXq2y)oYrOz}WHQz5xdZa4!WMoHQI)-rNV#YW(V`lJ zTh?2d+;+9S<2?$)d{&o`27IFjX=DQj+woUSw{L} zcCR3*#J~>2`|nwuM{z?pA%@RMll}EfB4c*uVf+xgntq^_ zQ#t2;xn8wUAGWtS4FTIw)%RecrV2sOG_AzJ(k|#$4}@-gN-mdd;^^CFCdrttX~%#@ znKXVBPBHAr?IcV~C7#`w(4^f8AZ$+IU?rc4A9=(FBp7w5)lKt-UP{EqBH+_J?)TWp zuq7Pg>~z>SP1YJm-OjgzKW+2WuKF3Wl%QF9veY<>Tc@~Y&aYpm29sqe5x?En%feG$ zHh8S4n+f$Le1vY|5{}A4Dk|i>afnqhZoPc5C^l~&~arx(Z{vX1vCD{-A zPR;xq#15<lqES)GZxJ2sKEOg0JH|J(?ARzdcMLEH{0HE=hiQL<&^$?d4p{yb8c3vg!F)PJ>SsqiAkAHj@KW^c2=(6;IZH26G-;~}gKoKcwSy(_XnPie^V%urk z>B_^?z}m*MG-E+F)D7i{!$Jz_a6Ntjp`Yi;6)R)0NKxwOL|@g{#k<&~pne!fX-I5FZ=fP06~>iqhqz&dJtnB!4hvvuB)kjUR+10h| z{alTr>q~w3+0*zlo`s(@XzXfRE$Z+*`2ggI2&+mV{Psx7gzF?{nZPXjT426%6S0FL z-Gqb2=v&>M*iuTQBCXsvMQls$N+gh85_(UXeMOihxQ?$uiis@$P3(n8XQZx%Izqpk zzoYakA5|YYEceoNHb<5X`Vt4&E1ULZLCUcwh_!7xnEm{br=q45zGHSVbpy5TW%!wf z$Qa*@f14K`O6iEIZ2jo>-96^BEFErd;P9@Oj*yV*e%!5g7$bG=p)4U!OQ_pZ{|=gg zd$PCbJglcv|Uv>W!gK(=KnpHgl_5;Z3r^+1J>`VX#X0cA@v!P;Y~9RrW&MdUeL=k~DxZf}* z$=$!sF3?t_x9nrQ>3^#8EhB^?DPy0O9)an}Ps^){*yZjWMHeg)ZwvHKUl6|P1zXgO z_x!rH`k9B`9M6U{IHM9O$f#t!CpEk%w!iDAqns_7pOq5@dv-n%Q5$5?8$IhkLGc|A zMnjy#7ckX8xkkVp*l1UDxyQ5y&E?3QKSLsoLI!U6+}{$AI|s+n z$%(G6CK2o|W}olCpYMcUukT6}$G*pJ*VH!nyXCjx#X8RdM&9{bXCrk`*&$h;<9HQX zR+iD1vbEvV!`J$*(_7#1K30US=)Qf0tJ|#8xx-^1GW&LO+K5hM;F&B^A!cPk2QHA6 zvc;4?%J?2?KvmO2dKL^Y@2ORGm(a|VpxP8guT4!J72(qb+P7mC7-vO%8zsfY`PSau zBF}=erMm7j+wZa+IxHpw6-uPx)Yp#N6W^ATDg>O9Dw+kIcALqY?fP@A6_YD?`om@Q zHBxj}i968E5$s2aI)X5(`!!^W)cVIW#7n8(SCz%ODp57;;5uP>h~~GHcPs=w6WYUc zDvH3QvWji1exvrZUAZM^oBZl_nY37&I$3UNDqL`1!F?^gMv*linwyVN!7sm?9<3%2 znvT>aAsv#YGy45AJ=>p1JkR`3^BGgIN(A2#YZJ*62dz4EUm27Hnp>+`Sz88ZC1crb zvml4ybvNzs50H6MEGULdcNZ?GnSUJ>JQLp^Ze;8$$xoFw-%AKonyli6DGZTIy>=p9 zC$DNUG8)VZDkRCjfQnp-Cx$V|W zz!hgZO@BJExlADLwW4-%8fQovUWFBYbJ?{qI?t$tenwoe?kiyiO&zMA9d<^Y)iI8~ zX(ETM6v_)Zp9^!A(m2oY?=Vn*wGsZ_ADeg`nAR>SxAxQpFodSmSH6SgBYr4S30BTi z&r&qP#8xfRvZ?coFmz1U3L1lXdK{6VA{j8HEZDzHvmhX7S!}!^^}H!_7)2GZTck6t zYjq@BSCwBU;eUm&@^Ty^tSS|Q^jqu?6;39(N;WdyPguYWjw~Jzx9DrA*E7~QS@oM& zns2>cTwy!Q)4hReKT^N&0%c<;?d{^WNLwoegw6w@xPM(jFG;R z%jtcSc(?KARj#o-)=%;dZpDRcRp^NrEpNllbHei%21`}}xd3H124&p!m@82-OydvC z2t!Mdt3y(R@_xWD2yus`lgqbx8CWINJR|+`_*DiIrFjn20LjXrX3aqfB_v%1UonEc z>eSgEr$bwixj$@k))pHeO8rU_&)Xu$Pu!Yft>IClGo>o| z!qKIYdgrP5U3)ioNb>>p8!Mb0<*>7KcDHoYl(RBuB5Fl%yY)9|mPtOjCd{kajf1Uv zK^oPrphnGuIPsuHbX){V?zif7{FzM*^rgL2w7Wlt>q$rTP|Rnx9kxGiDJ-4;sBO|G zskLaT7%`AR36mYFXB?^*loGJ<4trTUKjW{f9ufIoqFnuJNV-L`MWLMfl1?p-<-)#d z{cPA5=p1M_%OrMm|EuJQhUm&t3~k<{CZuXw@+!YELc_eV3BE=mw)lC@Bp)@!(=VT8 zlgl`oX4JN-3UyQB&nnB$>(xD#C*O$#GR;v7!=B)A^@s>KuKlKG^C?MX_;bJP*3J?);M}hqe4RNqHDygTFcFrjOsmv2N2r?gT`VaD_G5-P7*ci0;MRU#?Uh z|L_(tT75!WLiDv0)9n>$?MK_{k0qEF_#fNOS#GBo4>i2ma6%kBxInVx$~kdI31VK) zw#H69Iy>ZJDaV?tMoBWfhOicn283B7`qN?I^wQNM?;hVxg3Acxp4aN4Ar10?^q4-~bj={zKq{p96s>Ae$+G4(?h2#K6bK{|pf%{At7kORWG*P}341hG>G0 zmVg4F-u~YS;DIH8LkTZ(X(Rm4kMw^q0)kIh0;HQ96t;rr#mCaUgV~l;E zm^DBFo?8KO5Lxh*HDC-d4gD2JNEjmhJCOar``;h1zYPF?L7VumAMi&s z46q0a*NS8dchn1{v;}w{!lG>p4@=Za0^|x0jIf1k`}%+r$XD?|eJ8lU30weZS^GO7 zfn=Q&G_;44N_KD(*o2e#puZg;1u*VDXjrlXQ~;&@zkEXFKFQz7OhEb}yib^LcS!z^ z@Ps{F`17xaN$4k(|B&1ca5DSsua<=Svwz7XI0>l-zc>K;z|{5MF_6{~kOrK8Kg=Uv zeVVZN8|xoG%N^kg%R}NF#co z{M);80u&I-(f;kBp+o-lA>JA8!wc}UGoTE43*I^dtPgQxb%DoiiW!u10XTsxX3)r!PA)p(hIh`0(^)^5)e>O5<&x#y8Um? zPRW2WZt&f^l>r0X04cD4o@YTo!2CZ+ zO$O$A0m8t~IB?Yqc=hZg9)e(N|K8MwUEK)8@`RtV*WM2cG4KWyA<KB= z@YO4Tp}v3s#0~87g Date: Thu, 5 Dec 2024 15:42:05 +0300 Subject: [PATCH 2/8] Use Cell class in tabby --- .../pdf_reader/data_classes/tables/cell.py | 7 +++++-- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 20 +++++++++++-------- .../test_api_misc_multipage_table.py | 6 ++---- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index 0d42dc37..8665eeaa 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -25,6 +25,9 @@ def copy_from(cell: "Cell", y_bottom_right=y_bottom_right, id_con=cell.id_con, lines=cell.lines, + colspan=cell.colspan, + rowspan=cell.rowspan, + invisible=cell.invisible, is_attribute=cell.is_attribute, is_attribute_required=cell.is_attribute_required, rotated_angle=cell.rotated_angle, @@ -44,7 +47,7 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None, - contour_coord: Optional[BBox] = None) -> None: + contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: import uuid @@ -52,7 +55,7 @@ def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bott assert y_top_left <= y_bottom_right self.lines = [] if lines is None else lines - super().__init__(lines) + super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible) self.x_top_left = x_top_left self.x_bottom_right = x_bottom_right diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 1d0d594d..9e258b5e 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -159,7 +159,7 @@ def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: def __get_tables(self, page: dict) -> List[ScanTable]: import uuid - from dedoc.data_structures.cell_with_meta import CellWithMeta + from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.line_metadata import LineMetadata @@ -188,15 +188,19 @@ def __get_tables(self, page: dict) -> List[ScanTable]: cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"])) annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height)) """ - TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable" - https://jira.intra.ispras.ru/browse/TLDR-851 + TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable" + https://jira.intra.ispras.ru/browse/TLDR-851 """ - - result_row.append(CellWithMeta( + current_cell_properties = cell_properties[num_row][num_col] + result_row.append(Cell( lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)], - colspan=cell_properties[num_row][num_col]["col_span"], - rowspan=cell_properties[num_row][num_col]["row_span"], - invisible=bool(cell_properties[num_row][num_col]["invisible"]) + colspan=current_cell_properties["col_span"], + rowspan=current_cell_properties["row_span"], + invisible=bool(current_cell_properties["invisible"]), + x_top_left=int(current_cell_properties["x_top_left"]), + x_bottom_right=int(current_cell_properties["x_top_left"]) + int(current_cell_properties["width"]), + y_top_left=int(current_cell_properties["y_top_left"]), + y_bottom_right=int(current_cell_properties["y_top_left"]) + int(current_cell_properties["height"]) )) cells.append(result_row) diff --git a/tests/api_tests/test_api_misc_multipage_table.py b/tests/api_tests/test_api_misc_multipage_table.py index 5c3c0d2e..c7431247 100644 --- a/tests/api_tests/test_api_misc_multipage_table.py +++ b/tests/api_tests/test_api_misc_multipage_table.py @@ -47,12 +47,10 @@ def test_api_ml_table_recognition_synthetic_data_1(self) -> None: def test_api_ml_table_recognition_synthetic_data_3(self) -> None: file_name = "example_mp_table_with_repeate_header_2.pdf" - for pdf_param in ["false", "true"]: - # for "tabby" doesn't work because need to unify the output of table in matrix form and set attribute cells, - # without this tables won't be merge. + for pdf_param in ["false", "true", "tabby"]: tables = self._get_tables(file_name, pdf_with_text_layer=pdf_param) - self.assertEqual(len(tables), 1) + self.assertEqual(len(tables), 1, f"Error when pdf_with_text_layer={pdf_param}") table = tables[0]["cells"] self.assertListEqual( From ccf6d150afebf458775040aaad38231f35ed83b6 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Tue, 10 Dec 2024 19:35:56 +0300 Subject: [PATCH 3/8] TLDR-861 greate refactor table; improve merging tabby tables --- dedoc/data_structures/cell_with_meta.py | 5 +- .../pdf_reader/data_classes/tables/cell.py | 23 ++-- .../data_classes/tables/scantable.py | 87 +++----------- dedoc/readers/pdf_reader/pdf_base_reader.py | 19 +--- .../pdf_image_reader/pdf_image_reader.py | 2 - .../multipage_table_extractor.py | 26 ++--- .../onepage_table_extractor.py | 107 +++++------------- .../table_attribute_extractor.py | 52 ++++----- .../table_recognizer/table_recognizer.py | 29 +---- .../table_utils/accuracy_table_rec.py | 4 +- .../table_utils/img_processing.py | 8 +- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 15 ++- .../pdf_txtlayer_reader.py | 6 +- .../test_api_module_table_recognizer.py | 16 --- .../test_module_gost_frame_recognizer.py | 2 - .../unit_tests/test_module_table_detection.py | 71 ++++++------ 16 files changed, 153 insertions(+), 319 deletions(-) diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index d23cad1c..1ef652b0 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -47,9 +47,8 @@ def get_annotations(self) -> List[Annotation]: """ return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations - @staticmethod - def create_from_cell(cell: "CellWithMeta") -> "CellWithMeta": - return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible) + def __str__(self) -> str: + return f"CellWithMeta((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})" def to_api_schema(self) -> ApiCellWithMeta: import numpy as np diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index 8665eeaa..effd58c0 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -2,7 +2,6 @@ from dedocutils.data_structures import BBox -from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.line_with_meta import LineWithMeta @@ -19,6 +18,9 @@ def copy_from(cell: "Cell", x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right y_top_left = cell.y_top_left if y_top_left is None else y_top_left y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right + + # TODO change x_top_left ... y_bottom_right to BBox + return Cell(x_top_left=x_top_left, x_bottom_right=x_bottom_right, y_top_left=y_top_left, @@ -31,7 +33,7 @@ def copy_from(cell: "Cell", is_attribute=cell.is_attribute, is_attribute_required=cell.is_attribute_required, rotated_angle=cell.rotated_angle, - uid=cell.cell_uid, + uid=cell.uuid, contour_coord=cell.con_coord) def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None: @@ -46,7 +48,7 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) self.con_coord.shift(shift_x=shift_x, shift_y=shift_y) def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, - is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None, + is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None], contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: import uuid @@ -57,25 +59,20 @@ def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bott self.lines = [] if lines is None else lines super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible) + # TODO change to BBox self.x_top_left = x_top_left self.x_bottom_right = x_bottom_right self.y_top_left = y_top_left self.y_bottom_right = y_bottom_right + self.id_con = id_con + self.is_attribute = is_attribute self.is_attribute_required = is_attribute_required self.rotated_angle = rotated_angle - self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid - self.con_coord = contour_coord or BBox(0, 0, 0, 0) - - def __str__(self) -> str: - return f"Cell((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})" - def get_text(self) -> str: - return "\n".join([line.line for line in self.lines]) - - def get_annotations(self) -> List[Annotation]: - return LineWithMeta.join(self.lines, delimiter="\n").annotations + self.uuid = uuid.uuid4() if uuid is None else uid + self.con_coord = contour_coord or BBox(0, 0, 0, 0) def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None: from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index be812630..e8010886 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -1,4 +1,4 @@ -from typing import Any, List, Optional +from typing import List from dedocutils.data_structures import BBox @@ -9,93 +9,36 @@ from dedoc.readers.pdf_reader.data_classes.tables.location import Location -class ScanTable: - def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None, - name: str = "", order: int = -1) -> None: - self.matrix_cells = matrix_cells - self.page_number = page_number - self.locations = [] - self.name = name +class ScanTable(Table): + def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None: + + super().__init__(cells, TableMetadata(page_id=page_number)) self.order = order - if bbox is not None: - self.locations.append(Location(page_number, bbox)) + self.locations = [Location(page_number, bbox)] def extended(self, table: "ScanTable") -> None: # extend locations self.locations.extend(table.locations) # extend values - self.matrix_cells.extend(table.matrix_cells) + self.cells.extend(table.cells) # extend order self.order = max(self.order, table.order) def check_on_cell_instance(self) -> bool: - if len(self.matrix_cells) == 0: + if len(self.cells) == 0: return False - if len(self.matrix_cells[0]) == 0: + if len(self.cells[0]) == 0: return False - if not isinstance(self.matrix_cells[0][0], Cell): + if not isinstance(self.cells[0][0], Cell): return False return True def to_table(self) -> Table: - metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle) - cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells] - return Table(metadata=metadata, cells=cells_with_meta) - - @staticmethod - def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]: - attrs = [] - for i in range(0, len(attr_cells)): - attrs.append([a.get_text() for a in attr_cells[i]]) - - return attrs - - @staticmethod - def get_key_value_attrs(attrs: List, val: Any) -> dict: # noqa - res_attrs = [] - for i in range(0, len(attrs)): - res_attrs.append({"attr": attrs[i]}) - res = { - "attrs": res_attrs, - "val": val - } - return res - - @staticmethod - def get_index_of_end_string_attr(matrix_cells: List[List[Cell]]) -> int: - end_attr_string = 0 - for i in range(0, len(matrix_cells)): - if matrix_cells[i][0].is_attribute: - end_attr_string = i - - return end_attr_string + return super() @staticmethod - def get_attributes_cell(matrix_cells: List[List[Cell]]) -> (List[int], List[List[Cell]], int): - import copy - import numpy as np - - required_columns = [] - for j in range(0, len(matrix_cells[0])): - if matrix_cells[0][j].is_attribute_required: - required_columns.append(j) - - end_attr_string = ScanTable.get_index_of_end_string_attr(matrix_cells) - - attrs = copy.deepcopy(np.array(matrix_cells[0:end_attr_string + 1])) - attrs = attrs.transpose().tolist() - - return [required_columns, attrs, end_attr_string] - - @staticmethod - def get_matrix_attrs_and_data(matrix_cells: List[List[Cell]]) -> (List[List[Cell]], List[List[str]], List[List[str]]): - required_columns, attrs, end_attr_string = ScanTable.get_attributes_cell(matrix_cells) - attrs_text = ScanTable.get_cells_text(attrs) - - data = matrix_cells[(end_attr_string + 1):] - data_text = ScanTable.get_cells_text(data) - - return [attrs, attrs_text, data_text] + def get_cells_text(cells: List[List[CellWithMeta]]) -> List[List[str]]: + return [[cell.get_text() for cell in row] for row in cells] @property def location(self) -> Location: @@ -103,12 +46,12 @@ def location(self) -> Location: @property def uid(self) -> str: - return self.name + return self.metadata.uid def to_dict(self) -> dict: from collections import OrderedDict - data_text = ScanTable.get_cells_text(self.matrix_cells) + data_text = ScanTable.get_cells_text(self.cells) res = OrderedDict() res["locations"] = [location.to_dict() for location in self.locations] diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 4fd9fdec..60ccd865 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -15,8 +15,6 @@ ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ - "orient_analysis_cells", - "orient_cell_angle", "is_one_column_document", "document_orientation", "language", @@ -73,8 +71,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure params_for_parse = ParametersForParseDoc( language=param_utils.get_param_language(parameters), - orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters), - orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters), is_one_column_document=param_utils.get_param_is_one_column_document(parameters), document_orientation=param_utils.get_param_document_orientation(parameters), need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters), @@ -177,7 +173,7 @@ def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[Scan table_page_number = location.page_number location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left) page_number = scan_table.locations[0].page_number - for row in scan_table.matrix_cells: + for row in scan_table.cells: for cell in row: image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0] shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left) @@ -275,16 +271,3 @@ def _binarization(self, gray_image: ndarray) -> ndarray: binary_mask = gray_image >= np.quantile(gray_image, 0.05) gray_image[binary_mask] = 255 return gray_image - - def eval_tables_by_batch(self, - batch: Iterator[ndarray], - page_number_begin: int, - language: str, - orient_analysis_cells: bool = False, - orient_cell_angle: int = 270, - table_type: str = "") -> Tuple[List[ndarray], List[ScanTable]]: - from joblib import Parallel, delayed - - result_batch = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.table_recognizer.recognize_tables_from_image)( - image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) for i, image in enumerate(batch)) - return result_batch diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index 64d96fe6..e53ba9e3 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -85,8 +85,6 @@ def _process_one_page(self, image=rotated_image, page_number=page_number, language=parameters.language, - orient_analysis_cells=parameters.orient_analysis_cells, - orient_cell_angle=parameters.orient_cell_angle, table_type=parameters.table_type ) else: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py index 06abe0c2..5cff352d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py @@ -21,11 +21,11 @@ def extract_multipage_tables(self, single_tables: List[ScanTable], lines_with_me self.single_tables = single_tables multipages_tables = [] list_page_with_tables = [] - total_pages = max((table.page_number + 1 for table in single_tables), default=0) + total_pages = max((table.location.page_number + 1 for table in single_tables), default=0) for cur_page in range(total_pages): # 1. get possible diapason of neighbors pages with tables # pages distribution - list_mp_table = [t for t in self.single_tables if t.page_number == cur_page] + list_mp_table = [t for t in self.single_tables if t.location.page_number == cur_page] list_page_with_tables.append(list_mp_table) total_cur_page = 0 @@ -86,7 +86,7 @@ def __handle_multipage_table(self, # t2 is merged with t1 t1.extended(t2) list_page_with_tables[cur_page].pop(0) - self.__delete_ref_table(lines=lines_with_meta, table_name=t2.name) + self.__delete_ref_table(lines=lines_with_meta, table_name=t2.uid) else: if len(list_page_with_tables[cur_page]) > 0: cur_page -= 1 # analysis from the current page, not the next one @@ -118,8 +118,8 @@ def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]: for cell_id, cell in enumerate(row): if prev_uid is None: start = cell.x_top_left - prev_uid = cell.cell_uid - elif prev_uid != cell.cell_uid: + prev_uid = cell.uuid + elif prev_uid != cell.uuid: widths.append(end - start) start = cell.x_top_left end = cell.x_bottom_right @@ -154,28 +154,28 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool: return False # condition 2. Exclusion of the duplicated header (if any) - attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells) - attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells) + attr1 = TableAttributeExtractor.get_header_table(t1.cells) + attr2 = TableAttributeExtractor.get_header_table(t2.cells) t2_update = copy.deepcopy(t2) if TableAttributeExtractor.is_equal_attributes(attr1, attr2): - t2_update.matrix_cells = t2_update.matrix_cells[len(attr2):] + t2_update.cells = t2_update.cells[len(attr2):] - if len(t2_update.matrix_cells) == 0 or len(t1.matrix_cells) == 0: + if len(t2_update.cells) == 0 or len(t1.cells) == 0: return False - TableAttributeExtractor.clear_attributes(t2_update.matrix_cells) + TableAttributeExtractor.clear_attributes(t2_update.cells) # condition 3. Number of columns should be equal - if len(t1.matrix_cells[-1]) != len(t2_update.matrix_cells[0]): + if len(t1.cells[-1]) != len(t2_update.cells[0]): if self.config.get("debug_mode", False): self.logger.debug("Different count column") return False # condition 4. Comparison of the widths of last and first rows - if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.matrix_cells, t2_update.matrix_cells): + if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.cells, t2_update.cells): if self.config.get("debug_mode", False): self.logger.debug("Different width columns") return False - t2.matrix_cells = copy.deepcopy(t2_update.matrix_cells) # save changes + t2.cells = copy.deepcopy(t2_update.cells) # save changes return True diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index c946cccf..2a05a03c 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -1,10 +1,10 @@ import copy import logging -import uuid from typing import List import numpy as np +from dedoc.common.exceptions.recognize_error import RecognizeError from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree @@ -29,20 +29,12 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None: self.table_options = TableTypeAdditionalOptions() self.language = "rus" - def extract_onepage_tables_from_image(self, - image: np.ndarray, - page_number: int, - language: str, - orient_analysis_cells: bool, - orient_cell_angle: int, # TODO remove - table_type: str) -> List[ScanTable]: + def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str) -> List[ScanTable]: """ extracts tables from input image :param image: input gray image :param page_number: :param language: language for Tesseract - :param orient_analysis_cells: need or not analyse orientations of cells - :param orient_cell_angle: angle of cells (needs if orient_analysis_cells==True) :return: List[ScanTable] """ self.image = image @@ -50,72 +42,24 @@ def extract_onepage_tables_from_image(self, self.language = language # Read the image - tables_tree, contours, angle_rotate = detect_tables_by_contours(image, - language=language, - config=self.config, - orient_analysis_cells=orient_analysis_cells, - table_type=table_type) - + tables_tree, contours, angle_rotate = detect_tables_by_contours(image, language=language, config=self.config, table_type=table_type) tables = self.__build_structure_table_from_tree(tables_tree=tables_tree, table_type=table_type) - for matrix in tables: - for location in matrix.locations: + for table in tables: + for location in table.locations: location.bbox.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape) location.rotated_angle = angle_rotate - tables = self.__select_attributes_matrix_tables(tables=tables) + tables = self.__select_attributes_tables(tables=tables) return tables - """ TODO fix in the future (REMOVE) - def __detect_diff_orient(self, cell_text: str) -> bool: - # 1 - разбиваем на строки длины которых состоят хотя бы из одного символа - parts = cell_text.split("\n") - parts = [p for p in parts if len(p) > 0] - - # 2 - подсчитываем среднюю длину строк ячейки - len_parts = [len(p) for p in parts] - avg_len_part = np.average(len_parts) - - # Эвристика: считаем что ячейка повернута, если у нас большое количество строк и строки короткие - if len(parts) > TableTree.minimal_cell_cnt_line \ - and avg_len_part < TableTree.minimal_cell_avg_length_line: - return True - return False - - def __correct_orient_cell(self, cell: Cell, language: str, rotated_angle: int) -> [Cell, np.ndarray]: - img_cell = self.image[cell.y_top_left: cell.y_bottom_right, cell.x_top_left: cell.x_bottom_right] - rotated_image_cell = rotate_image(img_cell, -rotated_angle) - - output_dict = get_text_with_bbox_from_cells(img_cell, language=language) - line_boxes = [ - TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height)) - for line_num, line in enumerate(output_dict.lines)] - # get_cell_text_by_ocr(rotated_image_cell, language=language) - cell.set_rotated_angle(rotated_angle=-rotated_angle) - return cell, rotated_image_cell - - - def __analyze_header_cell_with_diff_orient(self, tables: List[ScanTable], language: str, - rotated_angle: int) -> List[ScanTable]: - + def __select_attributes_tables(self, tables: List[ScanTable]) -> List[ScanTable]: for table in tables: - attrs = TableAttributeExtractor.get_header_table(table.matrix_cells) - for i, row in enumerate(attrs): - for j, attr in enumerate(row): - if self.__detect_diff_orient(attr.text): - rotated_cell, rotated_image = self.__correct_orient_cell(attr, language=language, rotated_angle=rotated_angle) - table.matrix_cells[i][j] = rotated_cell - - return tables - """ - - def __select_attributes_matrix_tables(self, tables: List[ScanTable]) -> List[ScanTable]: - for matrix in tables: - matrix = self.attribute_selector.select_attributes(matrix) + table = self.attribute_selector.set_attributes(table) if self.config.get("debug_mode", False): - self._print_table_attr(matrix.matrix_cells) + self._print_table_attr(table.cells) return tables @@ -146,7 +90,7 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: for i in range(0, len(matrix)): matrix[i] = sorted(matrix[i], key=lambda cell: cell.x_top_left, reverse=False) - matrix_table = ScanTable(matrix_cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number, name=str(uuid.uuid4())) + matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number) return matrix_table @@ -157,19 +101,28 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type: tables = [] for table_tree in tables_tree.children: try: - cur_table = self.__get_matrix_table_from_tree(table_tree) - # Эвристика 1: Таблица должна состоять из 1 строк и более - if len(cur_table.matrix_cells) > 0: - cur_table.matrix_cells = self.splitter.split(cells=cur_table.matrix_cells) - - # Эвристика 2: таблица должна иметь больше одного столбца - if len(cur_table.matrix_cells[0]) > 1 or (self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []): - tables.append(cur_table) - - if self.table_options.split_last_column in table_type: - cur_table.matrix_cells = split_last_column(cur_table.matrix_cells, language=self.language, image=self.image) + table = self.__get_matrix_table_from_tree(table_tree) + table.cells = self.handle_cells(table.cells, table_type) + tables.append(table) except Exception as ex: self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. {ex}") if self.config.get("debug_mode", False): raise ex return tables + + def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[List[Cell]]: + # Эвристика 1: Таблица должна состоять из 1 строк и более + if len(cells) < 1: + raise RecognizeError("Invalid recognized table") + + cells = self.splitter.split(cells=cells) + + # Эвристика 2: таблица должна иметь больше одного столбца + if cells[0] == [] or (len(cells[0]) <= 1 and self.table_options.detect_one_cell_table not in table_type): + raise RecognizeError("Invalid recognized table") + + # Postprocess table + if self.table_options.split_last_column in table_type: + cells = split_last_column(cells, language=self.language, image=self.image) + + return cells diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py index f13f0eec..fbca8cd0 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py @@ -6,7 +6,7 @@ from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import similarity -class TableAttributeExtractor(object): +class TableAttributeExtractor: """ Class finds and labels "is_attributes=True" attribute cells into ScanTable """ @@ -14,7 +14,7 @@ class TableAttributeExtractor(object): def __init__(self, logger: logging.Logger) -> None: self.logger = logger - def select_attributes(self, scan_table: ScanTable) -> ScanTable: + def set_attributes(self, scan_table: ScanTable) -> ScanTable: return self.__set_attributes_for_type_top(scan_table) @staticmethod @@ -104,21 +104,21 @@ def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool def __analyze_attr_for_vertical_union_columns(self, scan_table: ScanTable) -> List[int]: vertical_union_columns = [] - if len(vertical_union_columns) != 0 and len(scan_table.matrix_cells) > 1: + if len(vertical_union_columns) != 0 and len(scan_table.cells) > 1: self.logger.debug("ATTR_TYPE: vertical union table") row_max_attr = 1 i = 1 # Установка атрибутов таблицы for i in range(0, row_max_attr): - for j in range(0, len(scan_table.matrix_cells[i])): - scan_table.matrix_cells[i][j].is_attribute = True + for j in range(0, len(scan_table.cells[i])): + scan_table.cells[i][j].is_attribute = True # Установка обязательных атрибутов - scan_table.matrix_cells[0][0].is_attribute_required = True - for j in range(1, len(scan_table.matrix_cells[0])): + scan_table.cells[0][0].is_attribute_required = True + for j in range(1, len(scan_table.cells[0])): is_attribute_required = True if is_attribute_required: - scan_table.matrix_cells[0][j].is_attribute_required = True + scan_table.cells[0][j].is_attribute_required = True return vertical_union_columns @@ -126,48 +126,48 @@ def __analyze_attr_for_horizontal_union_raws(self, scan_table: ScanTable) -> Lis horizontal_union_rows = [] union_first = False - for i in range(0, len(scan_table.matrix_cells)): + for i in range(0, len(scan_table.cells)): if len(horizontal_union_rows) > 0 and i not in horizontal_union_rows: horizontal_union_rows.append(i) - if not self.__is_empty_row(scan_table.matrix_cells, i): + if not self.__is_empty_row(scan_table.cells, i): break if union_first and len(horizontal_union_rows) != 0: self.logger.debug("ATTR_TYPE: horizontal_union_rows") for i in range(0, len(horizontal_union_rows)): - for j in range(0, len(scan_table.matrix_cells[i])): - scan_table.matrix_cells[i][j].is_attribute = True - scan_table.matrix_cells[0][0].is_attribute_required = True + for j in range(0, len(scan_table.cells[i])): + scan_table.cells[i][j].is_attribute = True + scan_table.cells[0][0].is_attribute_required = True first_required_column = 0 # search indexable_column # один один столбец должен быть (0) - нумерованным, # один (1) - с обязательными поляями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 if len(horizontal_union_rows) > 0 and \ - self.__is_indexable_column(scan_table.matrix_cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \ - and len(scan_table.matrix_cells) > first_required_column + 2: - scan_table.matrix_cells[0][first_required_column + 1].is_attribute_required = True + self.__is_indexable_column(scan_table.cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \ + and len(scan_table.cells) > first_required_column + 2: + scan_table.cells[0][first_required_column + 1].is_attribute_required = True # Полностью пустые строки не могут быть атрибутами (не информативны) # Перенос атрибутов на след строку таблицы index_empty_rows = horizontal_union_rows[-1] - if self.__is_empty_row(scan_table.matrix_cells, index_empty_rows) and len(scan_table.matrix_cells) != index_empty_rows + 1: + if self.__is_empty_row(scan_table.cells, index_empty_rows) and len(scan_table.cells) != index_empty_rows + 1: horizontal_union_rows.append(index_empty_rows + 1) - for j in range(0, len(scan_table.matrix_cells[index_empty_rows + 1])): - scan_table.matrix_cells[index_empty_rows + 1][j].is_attribute = True + for j in range(0, len(scan_table.cells[index_empty_rows + 1])): + scan_table.cells[index_empty_rows + 1][j].is_attribute = True self.logger.debug("detect empty attributes row") return horizontal_union_rows def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None: self.logger.debug("ATTR_TYPE: simple table") - for j in range(0, len(scan_table.matrix_cells[0])): - scan_table.matrix_cells[0][j].is_attribute = True + for j in range(0, len(scan_table.cells[0])): + scan_table.cells[0][j].is_attribute = True # set first required column j = 0 first_required_column = j - while j < len(scan_table.matrix_cells[0]): - if not self.__is_empty_column(scan_table.matrix_cells, j): - scan_table.matrix_cells[0][j].is_attribute_required = True + while j < len(scan_table.cells[0]): + if not self.__is_empty_column(scan_table.cells, j): + scan_table.cells[0][j].is_attribute_required = True first_required_column = j break j += 1 @@ -175,5 +175,5 @@ def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None: # один один столбец должен быть (0) - нумерованным, # один (1) - с обязательными поляями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 - if self.__is_indexable_column(scan_table.matrix_cells, first_required_column, 0) and len(scan_table.matrix_cells) > first_required_column + 2: - scan_table.matrix_cells[0][first_required_column + 1].is_attribute_required = True + if self.__is_indexable_column(scan_table.cells, first_required_column, 0) and len(scan_table.cells) > first_required_column + 2: + scan_table.cells[0][first_required_column + 1].is_attribute_required = True diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index c1124ca4..3d2f89ea 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -30,25 +30,13 @@ def __init__(self, *, config: dict = None) -> None: self.table_type = TableTypeAdditionalOptions() def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]: - multipage_tables = self.multipage_tables_extractor.extract_multipage_tables(single_tables=all_single_tables, lines_with_meta=lines_with_meta) return multipage_tables - def recognize_tables_from_image(self, - image: np.ndarray, - page_number: int, - language: str, - orient_analysis_cells: bool, - orient_cell_angle: int, - table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]: + def recognize_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]: self.logger.debug(f"Page {page_number}") try: - cleaned_image, matrix_tables = self.__rec_tables_from_img(image, - page_num=page_number, - language=language, - orient_analysis_cells=orient_analysis_cells, - orient_cell_angle=orient_cell_angle, - table_type=table_type) + cleaned_image, matrix_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type) return cleaned_image, matrix_tables except Exception as ex: logging.warning(ex) @@ -56,22 +44,15 @@ def recognize_tables_from_image(self, raise ex return image, [] - def __rec_tables_from_img(self, - src_image: np.ndarray, - page_num: int, - language: str, - orient_analysis_cells: bool, - orient_cell_angle: int, - table_type: str) -> Tuple[np.ndarray, List[ScanTable]]: + def __rec_tables_from_img(self, src_image: np.ndarray, page_num: int, language: str, table_type: str) -> Tuple[np.ndarray, List[ScanTable]]: gray_image = cv2.cvtColor(src_image, cv2.COLOR_BGR2GRAY) if len(src_image.shape) == 3 else src_image single_page_tables = self.onepage_tables_extractor.extract_onepage_tables_from_image( image=gray_image, page_number=page_num, language=language, - orient_analysis_cells=orient_analysis_cells, - orient_cell_angle=orient_cell_angle, table_type=table_type) + if self.config.get("labeling_mode", False): self.__save_tables(tables=single_page_tables, image=src_image, table_path=self.config.get("table_path", "/tmp/tables")) if self.table_type.detect_one_cell_table in table_type: @@ -130,7 +111,7 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool: black_mean = (table_image < 225).mean() table_area = bbox.width * bbox.height cells_area = 0 - for row in table.matrix_cells: + for row in table.cells: for cell in row: cells_area += cell.width * cell.height diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py index f18b7505..4b7211b6 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py @@ -75,7 +75,7 @@ def calc_agreement(matrix_gt: List[List[Cell]], matrix: List[List[Cell]]) -> flo def draw_recognized_cell(tables: List[ScanTable], path_image: str, path_save: str) -> None: img = cv2.imread(path_image) for t_index in range(0, len(tables)): - table = tables[t_index].matrix_cells + table = tables[t_index].cells bbox = tables[t_index].locations.location blue_color, green_color, red_color = (255, 0, 0), (0, 255, 0), (0, 0, 255) cv2.rectangle(img, (bbox.x_top_left, bbox.y_top_left), (bbox.width, bbox.height), blue_color, 6) @@ -127,7 +127,7 @@ def calc_accuracy(path_image: str, path_gt_struct: str, path_gt_text: str, path_ elif len(tables) <= index_table: agreements.append(0) else: - agreement = calc_agreement(matrix_cell_gt, tables[index_table].matrix_cells) + agreement = calc_agreement(matrix_cell_gt, tables[index_table].cells) agreements.append(agreement) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py index c060d9d6..6bc12eab 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py @@ -246,15 +246,9 @@ def __paint_bounds(image: np.ndarray) -> np.ndarray: return image -def detect_tables_by_contours(img: np.ndarray, - language: str = "rus", - orient_analysis_cells: bool = False, - table_type: str = "", - *, - config: dict) -> [TableTree, List[np.ndarray], float]: +def detect_tables_by_contours(img: np.ndarray, language: str = "rus", table_type: str = "", *, config: dict) -> [TableTree, List[np.ndarray], float]: """ detecting contours and TreeTable with help contour analysis. TreeTable is - :param orient_analysis_cells: :param img: input image :param language: parameter language for Tesseract :param config: dict from config.py diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 9e258b5e..c927ab0e 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -14,6 +14,8 @@ from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor class PdfTabbyReader(PdfBaseReader): @@ -36,6 +38,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.jar_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tabbypdf", "jars")) self.java_not_found_error = "`java` command is not found from this Python process. Please ensure Java is installed and PATH is set for `java`" self.default_config = {"JAR_PATH": os.path.join(self.jar_dir, self.jar_name)} + self.attribute_selector = TableAttributeExtractor(logger=self.logger) + self.table_extractor = OnePageTableExtractor(config=config, logger=self.logger) def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ @@ -158,7 +162,6 @@ def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: return result_json_path def __get_tables(self, page: dict) -> List[ScanTable]: - import uuid from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.line_metadata import LineMetadata @@ -204,7 +207,15 @@ def __get_tables(self, page: dict) -> List[ScanTable]: )) cells.append(result_row) - scan_tables.append(ScanTable(page_number=page_number, matrix_cells=cells, bbox=table_bbox, name=str(uuid.uuid4()), order=order)) + try: + cells = self.table_extractor.handle_cells(cells) + table = ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order) + table = self.attribute_selector.set_attributes(table) + scan_tables.append(table) + except Exception as ex: + self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. {ex}") + if self.config.get("debug_mode", False): + raise ex return scan_tables diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 4cebbaf4..385f02a8 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -52,8 +52,6 @@ def _process_one_page(self, image=gray_image, page_number=page_number, language=parameters.language, - orient_analysis_cells=parameters.orient_analysis_cells, - orient_cell_angle=parameters.orient_cell_angle, table_type=parameters.table_type ) else: @@ -87,7 +85,7 @@ def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tup shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left # shift tables to original coordinates for location in table.locations: location.bbox.shift(shift_x=shift_x, shift_y=shift_y) - for row in table.matrix_cells: + for row in table.cells: for cell in row: cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) @@ -97,7 +95,7 @@ def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int """ for table in tables: - for row in table.matrix_cells: + for row in table.cells: for cell in row: cell.change_lines_boxes_page_width_height(new_page_width=pdf_width, new_page_height=pdf_height) diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py index a73b4ee5..a6f48a70 100644 --- a/tests/api_tests/test_api_module_table_recognizer.py +++ b/tests/api_tests/test_api_module_table_recognizer.py @@ -1,6 +1,5 @@ import json import os -import unittest from typing import List from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -98,21 +97,6 @@ def _check_header_table(self, cells: List[dict]) -> None: self._check_similarity(row0[9], "Систетематический\nконтроль") self._check_similarity(row0[10], "Экспертная оценка") - @unittest.skip("TODO") - def test_api_table_recognition_with_diff_orient_cells_90(self) -> None: - file_name = "example_table_with_90_orient_cells.pdf" - response = self._send_request(file_name, dict(orient_analysis_cells=True, orient_cell_angle="90")) - table = response["content"]["tables"][0] - - self._check_header_table(table["cells"]) - - @unittest.skip - def test_api_table_recognition_with_diff_orient_cells_270(self) -> None: - file_name = "example_table_with_270_orient_cells.pdf" - response = self._send_request(file_name, dict(orient_analysis_cells=True, orient_cell_angle="270")) - table = response["content"]["tables"][0] - self._check_header_table(table["cells"]) - def test_pdf_table(self) -> None: file_name = "example_with_table1.pdf" result = self._send_request(file_name) diff --git a/tests/unit_tests/test_module_gost_frame_recognizer.py b/tests/unit_tests/test_module_gost_frame_recognizer.py index a2c33f09..1ac3a7c2 100644 --- a/tests/unit_tests/test_module_gost_frame_recognizer.py +++ b/tests/unit_tests/test_module_gost_frame_recognizer.py @@ -31,8 +31,6 @@ def _get_params_for_parse(self, parameters: Optional[dict], file_path: Optional[ file_path = file_path if file_path else "" params_for_parse = ParametersForParseDoc( language=param_utils.get_param_language(parameters), - orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters), - orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters), is_one_column_document=param_utils.get_param_is_one_column_document(parameters), document_orientation=param_utils.get_param_document_orientation(parameters), need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters), diff --git a/tests/unit_tests/test_module_table_detection.py b/tests/unit_tests/test_module_table_detection.py index 0aef1be0..39b1b4dc 100644 --- a/tests/unit_tests/test_module_table_detection.py +++ b/tests/unit_tests/test_module_table_detection.py @@ -21,12 +21,7 @@ class TestRecognizedTable(unittest.TestCase): table_recognizer = TableRecognizer(config=get_test_config()) def get_table(self, image: np.ndarray, language: str = "rus", table_type: str = "") -> List[ScanTable]: - image, tables = self.table_recognizer.recognize_tables_from_image(image=image, - page_number=0, - language=language, - orient_analysis_cells=False, - orient_cell_angle=0, - table_type=table_type) + image, tables = self.table_recognizer.recognize_tables_from_image(image=image, page_number=0, language=language, table_type=table_type) return tables def test_table_wo_external_bounds(self) -> None: @@ -50,13 +45,13 @@ def test_table_split_right_column(self) -> None: image = cv2.imread(path_image, 0) tables = self.get_table(image, "rus+eng", table_type="split_last_column+wo_external_bounds") - self.assertTrue(tables[0].matrix_cells[4][-1].get_text(), "40703978900000345077") - self.assertTrue(tables[0].matrix_cells[5][-1].get_text(), "049401814") - self.assertTrue(tables[0].matrix_cells[6][-1].get_text(), "30101810200000000814") - self.assertTrue(tables[0].matrix_cells[7][-1].get_text(), "049401814") - self.assertTrue(tables[0].matrix_cells[8][-1].get_text(), "30101810200000000814") - self.assertTrue(tables[0].matrix_cells[9][-1].get_text(), "30110978700000070815") - self.assertTrue(tables[0].matrix_cells[10][-1].get_text(), "30110978700000070815") + self.assertTrue(tables[0].cells[4][-1].get_text(), "40703978900000345077") + self.assertTrue(tables[0].cells[5][-1].get_text(), "049401814") + self.assertTrue(tables[0].cells[6][-1].get_text(), "30101810200000000814") + self.assertTrue(tables[0].cells[7][-1].get_text(), "049401814") + self.assertTrue(tables[0].cells[8][-1].get_text(), "30101810200000000814") + self.assertTrue(tables[0].cells[9][-1].get_text(), "30110978700000070815") + self.assertTrue(tables[0].cells[10][-1].get_text(), "30110978700000070815") def test_table_extract_one_cell_and_one_cell_tables(self) -> None: path_image = get_full_path("data/lising/platezhka.jpg") @@ -115,73 +110,73 @@ def test_table_recognition_1(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table3.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 8) self.assertEqual(cnt_columns, 3) self.assertEqual(cnt_a_cell, 3) self.assertEqual(cnt_cell, 24) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Наименование данных")) - self.assertTrue(similarity(tables[0].matrix_cells[0][2].get_text(), "Данные")) - self.assertTrue(similarity(tables[0].matrix_cells[4][1].get_text().capitalize(), "Инн")) - self.assertTrue(similarity(tables[0].matrix_cells[3][1].get_text(), "Руководитель (ФИО, телефон,\nфакс, электронный адрес)")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Наименование данных")) + self.assertTrue(similarity(tables[0].cells[0][2].get_text(), "Данные")) + self.assertTrue(similarity(tables[0].cells[4][1].get_text().capitalize(), "Инн")) + self.assertTrue(similarity(tables[0].cells[3][1].get_text(), "Руководитель (ФИО, телефон,\nфакс, электронный адрес)")) def test_table_recognition_2(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table4.jpg"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 5) self.assertEqual(cnt_columns, 3) self.assertEqual(cnt_a_cell, 3) self.assertEqual(cnt_cell, 15) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Перечень основных данных и\nтребований")) - self.assertTrue(similarity(tables[0].matrix_cells[0][2].get_text(), "Основные данные и требования")) - self.assertTrue(similarity(tables[0].matrix_cells[3][1].get_text(), "Количество")) - self.assertTrue(similarity(tables[0].matrix_cells[4][1].get_text(), "Технические параметры оборудования")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Перечень основных данных и\nтребований")) + self.assertTrue(similarity(tables[0].cells[0][2].get_text(), "Основные данные и требования")) + self.assertTrue(similarity(tables[0].cells[3][1].get_text(), "Количество")) + self.assertTrue(similarity(tables[0].cells[4][1].get_text(), "Технические параметры оборудования")) def test_table_recognition_3(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 13) self.assertEqual(cnt_columns, 3) self.assertEqual(cnt_a_cell, 3) self.assertEqual(cnt_cell, 39) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Техническая характеристика")) - self.assertTrue(similarity(tables[0].matrix_cells[0][2].get_text(), "Показатель")) - self.assertTrue(similarity(tables[0].matrix_cells[6][1].get_text(), "Использование крана и его механизмов")) - self.assertTrue(similarity(tables[0].matrix_cells[7][1].get_text(), "Тип привода:")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Техническая характеристика")) + self.assertTrue(similarity(tables[0].cells[0][2].get_text(), "Показатель")) + self.assertTrue(similarity(tables[0].cells[6][1].get_text(), "Использование крана и его механизмов")) + self.assertTrue(similarity(tables[0].cells[7][1].get_text(), "Тип привода:")) def test_table_recognition_4(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 13) self.assertEqual(cnt_columns, 3) self.assertEqual(cnt_a_cell, 3) self.assertEqual(cnt_cell, 39) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Техническая характеристика")) - self.assertTrue(similarity(tables[0].matrix_cells[0][2].get_text(), "Показатель")) - self.assertTrue(similarity(tables[0].matrix_cells[6][1].get_text(), "Использование крана и его механизмов")) - self.assertTrue(similarity(tables[0].matrix_cells[7][1].get_text(), "Тип привода:")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Техническая характеристика")) + self.assertTrue(similarity(tables[0].cells[0][2].get_text(), "Показатель")) + self.assertTrue(similarity(tables[0].cells[6][1].get_text(), "Использование крана и его механизмов")) + self.assertTrue(similarity(tables[0].cells[7][1].get_text(), "Тип привода:")) def test_table_recognition_with_rotate_5(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table6.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 3) self.assertEqual(cnt_columns, 7) self.assertEqual(cnt_a_cell, 7) self.assertEqual(cnt_cell, 21) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Группа")) - self.assertTrue(similarity(tables[0].matrix_cells[0][3].get_text(), "Наименование")) - self.assertTrue(similarity(tables[0].matrix_cells[2][2].get_text(), "Новая\nпозиция")) - self.assertTrue(similarity(tables[0].matrix_cells[2][5].get_text(), "3 (три)\nшт.")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Группа")) + self.assertTrue(similarity(tables[0].cells[0][3].get_text(), "Наименование")) + self.assertTrue(similarity(tables[0].cells[2][2].get_text(), "Новая\nпозиция")) + self.assertTrue(similarity(tables[0].cells[2][5].get_text(), "3 (три)\nшт.")) From 2a3d0e6b6f1b0aea5b233d565d3784babe307eb4 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Thu, 12 Dec 2024 15:19:33 +0300 Subject: [PATCH 4/8] TLDR-861 remove orient cell params --- dedoc/api/api_args.py | 3 --- dedoc/api/web/index.html | 10 +--------- .../pdf_reader/data_classes/tables/scantable.py | 3 --- dedoc/readers/pdf_reader/pdf_base_reader.py | 4 ++-- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 4 +--- dedoc/utils/parameter_utils.py | 17 ----------------- 6 files changed, 4 insertions(+), 37 deletions(-) diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index d1f7d5cf..f2b9e7c4 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -22,9 +22,6 @@ class QueryParameters: # tables handling need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf") table_type: str = Form("", description="Pipeline mode for table recognition") - orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers") - orient_cell_angle: str = Form("90", enum=["90", "270"], - description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation') # pdf handling pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index ede62117..c68963b6 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -101,21 +101,13 @@

Attachments handling

Tables handling

-
need_pdf_table_analysis, orient_analysis_cells, orient_cell_angle +
need_pdf_table_analysis

- -

- -

- -

- -

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index e8010886..fa60aaeb 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -33,9 +33,6 @@ def check_on_cell_instance(self) -> bool: return False return True - def to_table(self) -> Table: - return super() - @staticmethod def get_cells_text(cells: List[List[CellWithMeta]]) -> List[List[str]]: return [[cell.get_text() for cell in row] for row in cells] diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 60ccd865..41e2990f 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -87,12 +87,12 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure ) lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse) - tables = [scan_table.to_table() for scan_table in scan_tables] + # tables = [scan_table.to_table() for scan_table in scan_tables] if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path): attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) - result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=metadata) + result = UnstructuredDocument(lines=lines, tables=scan_tables, attachments=attachments, warnings=warnings, metadata=metadata) return self._postprocess(result) def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index c927ab0e..ef47db28 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -136,9 +136,7 @@ def __extract(self, path: str, parameters: dict, warnings: List[str], tmp_dir: s mp_tables = self.table_recognizer.convert_to_multipages_tables(all_scan_tables, lines_with_meta=all_lines) all_lines = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=all_attached_images) - tables = [scan_table.to_table() for scan_table in mp_tables] - - return all_lines, tables, all_attached_images, document_metadata + return all_lines, mp_tables, all_attached_images, document_metadata def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: Optional[int], page_count: int, path: str, tmp_dir: str) -> str: from joblib import Parallel, delayed diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py index 3df9f6ca..993b6b8a 100644 --- a/dedoc/utils/parameter_utils.py +++ b/dedoc/utils/parameter_utils.py @@ -33,13 +33,6 @@ def get_param_document_type(parameters: Optional[dict]) -> str: return document_type -def get_param_orient_analysis_cells(parameters: Optional[dict]) -> bool: - if parameters is None: - return False - orient_analysis_cells = str(parameters.get("orient_analysis_cells", "False")).lower() == "true" - return orient_analysis_cells - - def get_param_with_attachments(parameters: Optional[dict]) -> bool: if parameters is None: return False @@ -80,16 +73,6 @@ def get_param_need_binarization(parameters: Optional[dict]) -> bool: return need_binarization -def get_param_orient_cell_angle(parameters: Optional[dict]) -> int: - if parameters is None: - return 90 - - orient_cell_angle = str(parameters.get("orient_cell_angle", "90")) - if orient_cell_angle == "": - orient_cell_angle = "90" - return int(orient_cell_angle) - - def get_param_is_one_column_document(parameters: Optional[dict]) -> Optional[bool]: if parameters is None: return None From d5b1cc0d4cae36338c728402f1fdffc84214fd08 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Fri, 13 Dec 2024 18:44:02 +0300 Subject: [PATCH 5/8] TLDR-861 added BBox to Cell; rename TableHeaderExtractor; refactor table recognizer; added tests --- .../pdf_reader/data_classes/tables/cell.py | 60 ++----- .../table_recognizer/cell_splitter.py | 47 ++--- .../split_last_hor_union_cells.py | 14 +- .../multipage_table_extractor.py | 16 +- .../onepage_table_extractor.py | 29 ++-- .../table_attribute_extractor.py | 92 +++++----- .../table_recognizer/table_recognizer.py | 6 +- .../table_utils/accuracy_table_rec.py | 14 +- .../table_recognizer/table_utils/utils.py | 21 +-- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 29 ++-- .../test_api_format_pdf_tabby_reader.py | 4 +- tests/unit_tests/test_module_cell_splitter.py | 164 +++++++++--------- 12 files changed, 220 insertions(+), 276 deletions(-) diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index effd58c0..b2b28bf2 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -1,3 +1,4 @@ +import copy from typing import List, Optional from dedocutils.data_structures import BBox @@ -9,64 +10,33 @@ class Cell(CellWithMeta): @staticmethod - def copy_from(cell: "Cell", - x_top_left: Optional[int] = None, - x_bottom_right: Optional[int] = None, - y_top_left: Optional[int] = None, - y_bottom_right: Optional[int] = None) -> "Cell": - x_top_left = cell.x_top_left if x_top_left is None else x_top_left - x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right - y_top_left = cell.y_top_left if y_top_left is None else y_top_left - y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right - - # TODO change x_top_left ... y_bottom_right to BBox - - return Cell(x_top_left=x_top_left, - x_bottom_right=x_bottom_right, - y_top_left=y_top_left, - y_bottom_right=y_bottom_right, - id_con=cell.id_con, - lines=cell.lines, - colspan=cell.colspan, - rowspan=cell.rowspan, - invisible=cell.invisible, - is_attribute=cell.is_attribute, - is_attribute_required=cell.is_attribute_required, - rotated_angle=cell.rotated_angle, - uid=cell.uuid, - contour_coord=cell.con_coord) + def copy_from(cell: "Cell", bbox: Optional[BBox] = None) -> "Cell": + copy_cell = copy.deepcopy(cell) + if bbox: + copy_cell.bbox = bbox + + return copy_cell def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None: if self.lines: for line in self.lines: line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) - self.x_top_left += shift_x - self.x_bottom_right += shift_x - self.y_top_left += shift_y - self.y_bottom_right += shift_y + + self.bbox.shift(shift_x=shift_x, shift_y=shift_y) if self.con_coord: self.con_coord.shift(shift_x=shift_x, shift_y=shift_y) - def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, + def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None], contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: import uuid - assert x_top_left <= x_bottom_right - assert y_top_left <= y_bottom_right - self.lines = [] if lines is None else lines super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible) - # TODO change to BBox - self.x_top_left = x_top_left - self.x_bottom_right = x_bottom_right - self.y_top_left = y_top_left - self.y_bottom_right = y_bottom_right - + self.bbox = bbox self.id_con = id_con - self.is_attribute = is_attribute self.is_attribute_required = is_attribute_required self.rotated_angle = rotated_angle @@ -96,11 +66,3 @@ def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_hei def __repr__(self) -> str: return self.__str__() - - @property - def width(self) -> int: - return self.x_bottom_right - self.x_top_left - - @property - def height(self) -> int: - return self.y_bottom_right - self.y_top_left diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py index 0e72128c..ab1c355d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py @@ -1,6 +1,7 @@ from typing import Dict, List, Optional, Tuple import numpy as np +from dedocutils.data_structures import BBox from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.utils.utils import flatten @@ -55,25 +56,26 @@ def split(self, cells: List[List[Cell]]) -> List[List[Cell]]: for row_id, row in enumerate(result_matrix): for col_id, cell in enumerate(row): if cell is None: - result_matrix[row_id][col_id] = Cell(x_top_left=horizontal_borders[row_id], - x_bottom_right=horizontal_borders[row_id + 1], - y_top_left=vertical_borders[col_id], - y_bottom_right=vertical_borders[col_id + 1]) + bbox = BBox(x_top_left=int(horizontal_borders[row_id]), + y_top_left=int(vertical_borders[col_id]), + width=int(horizontal_borders[row_id + 1] - horizontal_borders[row_id]), + height=int(vertical_borders[col_id + 1] - vertical_borders[col_id])) + result_matrix[row_id][col_id] = Cell(bbox=bbox) return result_matrix @staticmethod def __split_one_cell(cell: Cell, horizontal_borders: np.ndarray, vertical_borders: np.ndarray, result_matrix: List[List[Cell]]) -> None: - left_id, right_id = np.searchsorted(vertical_borders, [cell.x_top_left, cell.x_bottom_right]) - top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.y_top_left, cell.y_bottom_right]) + left_id, right_id = np.searchsorted(vertical_borders, [cell.bbox.x_top_left, cell.bbox.x_bottom_right]) + top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.bbox.y_top_left, cell.bbox.y_bottom_right]) colspan = right_id - left_id rowspan = bottom_id - top_id for row_id in range(top_id, bottom_id): for column_id in range(left_id, right_id): - new_cell = Cell.copy_from(cell, - x_top_left=vertical_borders[column_id], - x_bottom_right=vertical_borders[column_id + 1], - y_top_left=horizontal_borders[row_id], - y_bottom_right=horizontal_borders[row_id + 1]) + bbox = BBox(x_top_left=int(vertical_borders[column_id]), + y_top_left=int(horizontal_borders[row_id]), + width=int(vertical_borders[column_id + 1] - vertical_borders[column_id]), + height=int(horizontal_borders[row_id + 1] - horizontal_borders[row_id])) + new_cell = Cell.copy_from(cell, bbox) new_cell.invisible = True result_matrix[row_id][column_id] = new_cell @@ -106,20 +108,21 @@ def _merge_close_borders(self, cells: List[List[Cell]]) -> List[List[Cell]]: @return: cells with merged borders """ horizontal_borders, vertical_borders = self.__get_borders(cells) - eps_vertical = self.eps * min((cell.width for cell in flatten(cells)), default=0) - eps_horizontal = self.eps * min((cell.height for cell in flatten(cells)), default=0) + eps_vertical = self.eps * min((cell.bbox.width for cell in flatten(cells)), default=0) + eps_horizontal = self.eps * min((cell.bbox.height for cell in flatten(cells)), default=0) horizontal_dict = self.__get_border_dict(borders=horizontal_borders, threshold=eps_horizontal) vertical_dict = self.__get_border_dict(borders=vertical_borders, threshold=eps_vertical) result = [] for row in cells: new_row = [] for cell in row: - x_top_left = vertical_dict[cell.x_top_left] - x_bottom_right = vertical_dict[cell.x_bottom_right] - y_top_left = horizontal_dict[cell.y_top_left] - y_bottom_right = horizontal_dict[cell.y_bottom_right] + x_top_left = vertical_dict[cell.bbox.x_top_left] + x_bottom_right = vertical_dict[cell.bbox.x_bottom_right] + y_top_left = horizontal_dict[cell.bbox.y_top_left] + y_bottom_right = horizontal_dict[cell.bbox.y_bottom_right] if y_top_left < y_bottom_right and x_top_left < x_bottom_right: - new_cell = Cell.copy_from(cell, x_top_left=x_top_left, x_bottom_right=x_bottom_right, y_top_left=y_top_left, y_bottom_right=y_bottom_right) + bbox = BBox(x_top_left=x_top_left, y_top_left=y_top_left, width=x_bottom_right - x_top_left, height=y_bottom_right - y_top_left) + new_cell = Cell.copy_from(cell, bbox) new_row.append(new_cell) result.append(new_row) return result @@ -130,8 +133,8 @@ def __get_borders(cells: List[List[Cell]]) -> Tuple[List[int], List[int]]: vertical_borders = [] for row in cells: for cell in row: - horizontal_borders.append(cell.y_top_left) - horizontal_borders.append(cell.y_bottom_right) - vertical_borders.append(cell.x_top_left) - vertical_borders.append(cell.x_bottom_right) + horizontal_borders.append(cell.bbox.y_top_left) + horizontal_borders.append(cell.bbox.y_bottom_right) + vertical_borders.append(cell.bbox.x_top_left) + vertical_borders.append(cell.bbox.x_bottom_right) return horizontal_borders, vertical_borders diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index 0b14f034..e80769e0 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -127,8 +127,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image # Get width of all union cell eps = len(union_cell) - x_left = union_cell[0].x_top_left + eps - x_right = union_cell[-1].x_bottom_right + x_left = union_cell[0].bbox.x_top_left + eps + x_right = union_cell[-1].bbox.x_bottom_right # get y coordinate from cell before union cell y_top_split = cell_splitter.con_coord.y_top_left y_bottom_split = cell_splitter.con_coord.y_top_left + cell_splitter.con_coord.height @@ -141,8 +141,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image col_id = len(union_cell) - 1 result_row = copy.deepcopy(union_cell) while col_id >= 0: - union_cell[col_id].y_top_left = y_top_split - union_cell[col_id].y_bottom_right = y_bottom_split + union_cell[col_id].bbox.y_top_left = y_top_split + union_cell[col_id].bbox.height = y_bottom_split - union_cell[col_id].bbox.y_top_left cell_image, padding_value = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right]) result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image, @@ -163,10 +163,8 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra text_line = OCRCellExtractor.get_line_with_meta("") for word in line.words: # do absolute coordinate on src_image (inside src_image) - word.bbox.y_top_left -= padding_cell_value - word.bbox.x_top_left -= padding_cell_value - word.bbox.y_top_left += cell_bbox.y_top_left - word.bbox.x_top_left += cell_bbox.x_top_left + word.bbox.shift(shift_x=-padding_cell_value, shift_y=-padding_cell_value) + word.bbox.shift(shift_x=cell_bbox.x_top_left, shift_y=cell_bbox.y_top_left) # add space between words if len(text_line) != 0: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py index 5cff352d..8d74829d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py @@ -7,7 +7,7 @@ from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps @@ -117,12 +117,12 @@ def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]: end = None for cell_id, cell in enumerate(row): if prev_uid is None: - start = cell.x_top_left + start = cell.bbox.x_top_left prev_uid = cell.uuid elif prev_uid != cell.uuid: widths.append(end - start) - start = cell.x_top_left - end = cell.x_bottom_right + start = cell.bbox.x_top_left + end = cell.bbox.x_bottom_right if cell_id == len(row) - 1: widths.append(end - start) return widths @@ -154,16 +154,16 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool: return False # condition 2. Exclusion of the duplicated header (if any) - attr1 = TableAttributeExtractor.get_header_table(t1.cells) - attr2 = TableAttributeExtractor.get_header_table(t2.cells) + attr1 = TableHeaderExtractor.get_header_table(t1.cells) + attr2 = TableHeaderExtractor.get_header_table(t2.cells) t2_update = copy.deepcopy(t2) - if TableAttributeExtractor.is_equal_attributes(attr1, attr2): + if TableHeaderExtractor.is_equal_header(attr1, attr2): t2_update.cells = t2_update.cells[len(attr2):] if len(t2_update.cells) == 0 or len(t1.cells) == 0: return False - TableAttributeExtractor.clear_attributes(t2_update.cells) + TableHeaderExtractor.clear_attributes(t2_update.cells) # condition 3. Number of columns should be equal if len(t1.cells[-1]) != len(t2_update.cells[0]): diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index 2a05a03c..f345f1e3 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -12,7 +12,7 @@ from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours @@ -23,7 +23,7 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None: self.image = None self.page_number = 0 - self.attribute_selector = TableAttributeExtractor(logger=self.logger) + self.table_header_selector = TableHeaderExtractor(logger=self.logger) self.count_vertical_extended = 0 self.splitter = CellSplitter() self.table_options = TableTypeAdditionalOptions() @@ -50,17 +50,6 @@ def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int, location.bbox.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape) location.rotated_angle = angle_rotate - tables = self.__select_attributes_tables(tables=tables) - - return tables - - def __select_attributes_tables(self, tables: List[ScanTable]) -> List[ScanTable]: - for table in tables: - table = self.attribute_selector.set_attributes(table) - - if self.config.get("debug_mode", False): - self._print_table_attr(table.cells) - return tables def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: @@ -71,15 +60,12 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: matrix = [] line = [] for cell in table_tree.children: - if len(line) != 0 and abs(cell.cell_box.y_top_left - line[-1].y_top_left) > 15: # add eps + if len(line) != 0 and abs(cell.cell_box.y_top_left - line[-1].bbox.y_top_left) > 15: # add eps cpy_line = copy.deepcopy(line) matrix.append(cpy_line) line.clear() - cell_ = Cell(x_top_left=cell.cell_box.x_top_left, - x_bottom_right=cell.cell_box.x_bottom_right, - y_top_left=cell.cell_box.y_top_left, - y_bottom_right=cell.cell_box.y_bottom_right, + cell_ = Cell(bbox=cell.cell_box, id_con=cell.id_contours, lines=cell.lines, contour_coord=cell.cell_box) @@ -88,7 +74,7 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: # sorting column in each row for i in range(0, len(matrix)): - matrix[i] = sorted(matrix[i], key=lambda cell: cell.x_top_left, reverse=False) + matrix[i] = sorted(matrix[i], key=lambda cell: cell.bbox.x_top_left, reverse=False) matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number) @@ -125,4 +111,9 @@ def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[Li if self.table_options.split_last_column in table_type: cells = split_last_column(cells, language=self.language, image=self.image) + self.table_header_selector.set_header_cells(cells) + + if self.config.get("debug_mode", False): + self._print_table_attr(cells) + return cells diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py index fbca8cd0..e25dbd2d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py @@ -2,31 +2,31 @@ from typing import List from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell -from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import similarity -class TableAttributeExtractor: +class TableHeaderExtractor: """ - Class finds and labels "is_attributes=True" attribute cells into ScanTable + Class finds and labels "is_attributes=True" attribute (header) cells into ScanTable + """ def __init__(self, logger: logging.Logger) -> None: self.logger = logger - def set_attributes(self, scan_table: ScanTable) -> ScanTable: - return self.__set_attributes_for_type_top(scan_table) + def set_header_cells(self, cells: List[List[Cell]]) -> None: + self.__set_attributes_for_type_top(cells) @staticmethod - def is_equal_attributes(attr1: List[List[Cell]], attr2: List[List[Cell]], thr_similarity: int = 0.8) -> bool: - if len(attr1) != len(attr2): + def is_equal_header(header_1: List[List[Cell]], header_2: List[List[Cell]], thr_similarity: int = 0.8) -> bool: + if len(header_1) != len(header_2): return False - for i in range(len(attr1)): - if len(attr1[i]) != len(attr2[i]): + for i in range(len(header_1)): + if len(header_1[i]) != len(header_2[i]): return False - for j in range(len(attr1[i])): - if similarity(attr1[i][j].get_text(), attr2[i][j].get_text()) < thr_similarity: + for j in range(len(header_1[i])): + if similarity(header_1[i][j].get_text(), header_2[i][j].get_text()) < thr_similarity: return False return True @@ -44,7 +44,7 @@ def check_have_attributes(matrix_table: List[List[Cell]]) -> bool: @staticmethod def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]: - if not TableAttributeExtractor.check_have_attributes(matrix_table): + if not TableHeaderExtractor.check_have_attributes(matrix_table): return matrix_table[:1] header_rows = len(matrix_table) @@ -58,7 +58,7 @@ def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]: @staticmethod def clear_attributes(matrix_table: List[List[Cell]]) -> None: - if not TableAttributeExtractor.check_have_attributes(matrix_table): + if not TableHeaderExtractor.check_have_attributes(matrix_table): return for row in matrix_table: @@ -74,15 +74,15 @@ def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, return True return False - def __set_attributes_for_type_top(self, scan_table: ScanTable) -> ScanTable: - vertical_union_columns = self.__analyze_attr_for_vertical_union_columns(scan_table) - horizontal_union_rows = self.__analyze_attr_for_horizontal_union_raws(scan_table) + def __set_attributes_for_type_top(self, cells: List[List[Cell]]) -> List[List[Cell]]: + vertical_union_columns = self.__analyze_attr_for_vertical_union_columns(cells) + horizontal_union_rows = self.__analyze_attr_for_horizontal_union_raws(cells) # simple table if (0 not in horizontal_union_rows) and len(vertical_union_columns) == 0: - self.__analyze_attr_for_simple_table(scan_table) + self.__analyze_attr_for_simple_table(cells) - return scan_table + return cells def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> bool: all_empty = True @@ -102,72 +102,72 @@ def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool break return all_empty - def __analyze_attr_for_vertical_union_columns(self, scan_table: ScanTable) -> List[int]: + def __analyze_attr_for_vertical_union_columns(self, cells: List[List[Cell]]) -> List[int]: vertical_union_columns = [] - if len(vertical_union_columns) != 0 and len(scan_table.cells) > 1: + if len(vertical_union_columns) != 0 and len(cells) > 1: self.logger.debug("ATTR_TYPE: vertical union table") row_max_attr = 1 - i = 1 # Установка атрибутов таблицы for i in range(0, row_max_attr): - for j in range(0, len(scan_table.cells[i])): - scan_table.cells[i][j].is_attribute = True + for j in range(0, len(cells[i])): + cells[i][j].is_attribute = True + # Установка обязательных атрибутов - scan_table.cells[0][0].is_attribute_required = True - for j in range(1, len(scan_table.cells[0])): + cells[0][0].is_attribute_required = True + for j in range(1, len(cells[0])): is_attribute_required = True if is_attribute_required: - scan_table.cells[0][j].is_attribute_required = True + cells[0][j].is_attribute_required = True return vertical_union_columns - def __analyze_attr_for_horizontal_union_raws(self, scan_table: ScanTable) -> List[int]: + def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> List[int]: horizontal_union_rows = [] union_first = False - for i in range(0, len(scan_table.cells)): + for i in range(0, len(cells)): if len(horizontal_union_rows) > 0 and i not in horizontal_union_rows: horizontal_union_rows.append(i) - if not self.__is_empty_row(scan_table.cells, i): + if not self.__is_empty_row(cells, i): break if union_first and len(horizontal_union_rows) != 0: self.logger.debug("ATTR_TYPE: horizontal_union_rows") for i in range(0, len(horizontal_union_rows)): - for j in range(0, len(scan_table.cells[i])): - scan_table.cells[i][j].is_attribute = True - scan_table.cells[0][0].is_attribute_required = True + for j in range(0, len(cells[i])): + cells[i][j].is_attribute = True + cells[0][0].is_attribute_required = True first_required_column = 0 # search indexable_column # один один столбец должен быть (0) - нумерованным, # один (1) - с обязательными поляями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 if len(horizontal_union_rows) > 0 and \ - self.__is_indexable_column(scan_table.cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \ - and len(scan_table.cells) > first_required_column + 2: - scan_table.cells[0][first_required_column + 1].is_attribute_required = True + self.__is_indexable_column(cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \ + and len(cells) > first_required_column + 2: + cells[0][first_required_column + 1].is_attribute_required = True # Полностью пустые строки не могут быть атрибутами (не информативны) # Перенос атрибутов на след строку таблицы index_empty_rows = horizontal_union_rows[-1] - if self.__is_empty_row(scan_table.cells, index_empty_rows) and len(scan_table.cells) != index_empty_rows + 1: + if self.__is_empty_row(cells, index_empty_rows) and len(cells) != index_empty_rows + 1: horizontal_union_rows.append(index_empty_rows + 1) - for j in range(0, len(scan_table.cells[index_empty_rows + 1])): - scan_table.cells[index_empty_rows + 1][j].is_attribute = True + for j in range(0, len(cells[index_empty_rows + 1])): + cells[index_empty_rows + 1][j].is_attribute = True self.logger.debug("detect empty attributes row") return horizontal_union_rows - def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None: + def __analyze_attr_for_simple_table(self, cells: List[List[Cell]]) -> None: self.logger.debug("ATTR_TYPE: simple table") - for j in range(0, len(scan_table.cells[0])): - scan_table.cells[0][j].is_attribute = True + for j in range(0, len(cells[0])): + cells[0][j].is_attribute = True # set first required column j = 0 first_required_column = j - while j < len(scan_table.cells[0]): - if not self.__is_empty_column(scan_table.cells, j): - scan_table.cells[0][j].is_attribute_required = True + while j < len(cells[0]): + if not self.__is_empty_column(cells, j): + cells[0][j].is_attribute_required = True first_required_column = j break j += 1 @@ -175,5 +175,5 @@ def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None: # один один столбец должен быть (0) - нумерованным, # один (1) - с обязательными поляями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 - if self.__is_indexable_column(scan_table.cells, first_required_column, 0) and len(scan_table.cells) > first_required_column + 2: - scan_table.cells[0][first_required_column + 1].is_attribute_required = True + if self.__is_indexable_column(cells, first_required_column, 0) and len(cells) > first_required_column + 2: + cells[0][first_required_column + 1].is_attribute_required = True diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index 3d2f89ea..eb07732d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -36,8 +36,8 @@ def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines def recognize_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]: self.logger.debug(f"Page {page_number}") try: - cleaned_image, matrix_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type) - return cleaned_image, matrix_tables + cleaned_image, scan_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type) + return cleaned_image, scan_tables except Exception as ex: logging.warning(ex) if self.config.get("debug_mode", False): @@ -113,7 +113,7 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool: cells_area = 0 for row in table.cells: for cell in row: - cells_area += cell.width * cell.height + cells_area += cell.bbox.width * cell.bbox.height ratio = cells_area / table_area res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or ratio < 0.65 diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py index 4b7211b6..c98d71a5 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py @@ -4,6 +4,7 @@ from typing import List, Tuple import cv2 +from dedocutils.data_structures import BBox from dedoc.config import get_config from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell @@ -12,7 +13,7 @@ def _create_cell(c: str, text_cells: list) -> Cell: - cell = Cell(x_bottom_right=-1, x_top_left=-1, y_top_left=-1, y_bottom_right=-1) + cell = Cell(BBox(x_top_left=-1, y_top_left=-1, width=0, height=0)) if "a" in c: cell.is_attribute = True # loading cell text @@ -81,8 +82,15 @@ def draw_recognized_cell(tables: List[ScanTable], path_image: str, path_save: st cv2.rectangle(img, (bbox.x_top_left, bbox.y_top_left), (bbox.width, bbox.height), blue_color, 6) for i in range(0, len(table)): for j in range(0, len(table[i])): - cv2.rectangle(img, (table[i][j].x_top_left, table[i][j].y_top_left), (table[i][j].x_bottom_right, table[i][j].y_bottom_right), red_color, 4) - cv2.putText(img, str(table[i][j].id_con), (table[i][j].x_top_left, table[i][j].y_bottom_right), cv2.FONT_HERSHEY_PLAIN, 4, green_color) + cv2.rectangle(img, + (table[i][j].bbox.x_top_left, table[i][j].bbox.y_top_left), + (table[i][j].bbox.x_bottom_right, table[i][j].bbox.y_bottom_right), + red_color, 4 + ) + cv2.putText(img, str(table[i][j].id_con), + (table[i][j].bbox.x_top_left, table[i][j].bbox.y_bottom_right), + cv2.FONT_HERSHEY_PLAIN, 4, green_color + ) cv2.imwrite(path_save, img) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py index 19674772..80ac01e7 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py @@ -1,5 +1,3 @@ -import difflib - import numpy as np @@ -20,24 +18,9 @@ def get_highest_pixel_frequency(image: np.ndarray) -> int: def similarity(s1: str, s2: str) -> float: """string similarity""" + import difflib + normalized1 = s1.lower() normalized2 = s2.lower() matcher = difflib.SequenceMatcher(None, normalized1, normalized2) return matcher.ratio() - - -MINIMAL_CELL_CNT_LINE = 7 -MINIMAL_CELL_AVG_LENGTH_LINE = 10 - - -def detect_diff_orient(cell_text: str) -> bool: - # 1 - разбиваем на строки длины которых состоят хотя бы из одного символа - parts = cell_text.split("\n") - parts = [p for p in parts if len(p) > 0] - - # 2 - подсчитываем среднюю длину строк ячейки - len_parts = [len(p) for p in parts] - avg_len_part = np.average(len_parts) - - # Эвристика: считаем сто ячейка повернута если у нас большое количество строк и строки короткие - return len(parts) > MINIMAL_CELL_CNT_LINE and avg_len_part < MINIMAL_CELL_AVG_LENGTH_LINE diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index ef47db28..cce14d01 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -15,7 +15,7 @@ from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor class PdfTabbyReader(PdfBaseReader): @@ -38,7 +38,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.jar_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tabbypdf", "jars")) self.java_not_found_error = "`java` command is not found from this Python process. Please ensure Java is installed and PATH is set for `java`" self.default_config = {"JAR_PATH": os.path.join(self.jar_dir, self.jar_name)} - self.attribute_selector = TableAttributeExtractor(logger=self.logger) + self.table_header_selector = TableHeaderExtractor(logger=self.logger) self.table_extractor = OnePageTableExtractor(config=config, logger=self.logger) def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: @@ -171,7 +171,7 @@ def __get_tables(self, page: dict) -> List[ScanTable]: for table in page["tables"]: table_bbox = BBox(x_top_left=table["x_top_left"], y_top_left=table["y_top_left"], width=table["width"], height=table["height"]) - order = table["order"] # TODO add table order into TableMetadata + order = table["order"] rows = table["rows"] cell_properties = table["cell_properties"] assert len(rows) == len(cell_properties) @@ -188,30 +188,27 @@ def __get_tables(self, page: dict) -> List[ScanTable]: for c in cell_blocks: cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"])) annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height)) - """ - TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable" - https://jira.intra.ispras.ru/browse/TLDR-851 - """ + current_cell_properties = cell_properties[num_row][num_col] + bbox = BBox(x_top_left=int(current_cell_properties["x_top_left"]), + y_top_left=int(current_cell_properties["y_top_left"]), + width=int(current_cell_properties["width"]), + height=int(current_cell_properties["height"])) + result_row.append(Cell( + bbox=bbox, lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)], colspan=current_cell_properties["col_span"], rowspan=current_cell_properties["row_span"], - invisible=bool(current_cell_properties["invisible"]), - x_top_left=int(current_cell_properties["x_top_left"]), - x_bottom_right=int(current_cell_properties["x_top_left"]) + int(current_cell_properties["width"]), - y_top_left=int(current_cell_properties["y_top_left"]), - y_bottom_right=int(current_cell_properties["y_top_left"]) + int(current_cell_properties["height"]) + invisible=bool(current_cell_properties["invisible"]) )) cells.append(result_row) try: cells = self.table_extractor.handle_cells(cells) - table = ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order) - table = self.attribute_selector.set_attributes(table) - scan_tables.append(table) + scan_tables.append(ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order)) except Exception as ex: - self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. {ex}") + self.logger.warning(f"Warning: unrecognized table on page {self.page_number}. {ex}") if self.config.get("debug_mode", False): raise ex diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index b2ff91a6..959e15ca 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -182,7 +182,7 @@ def test_pdf_with_tables(self) -> None: table = tables[3]["cells"] self.assertListEqual(["", "2016", "2017", "2018", "2019"], self._get_text_of_row(table[0])) - self.assertListEqual(["", "Прогноз", "Прогноз бюджета"], self._get_text_of_row(table[1])) + self.assertListEqual(["", "Прогноз", "Прогноз бюджета", "Прогноз бюджета", "Прогноз бюджета"], self._get_text_of_row(table[1])) self.assertListEqual(["Ненефтегазов\nые доходы", "10,4", "9,6", "9,6", "9,6"], self._get_text_of_row(table[21])) self.assertListEqual(["Сальдо\nбюджета", "-3,7", "-3,2", "-2,2", "-1,2"], self._get_text_of_row(table[22])) @@ -227,7 +227,7 @@ def test_tables_with_merged_cells(self) -> None: result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby")) table = result["content"]["tables"][0]["cells"] - hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 1), 5]] + hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 5), 5]] for (i, j), k in hidden_cells_big_table_with_colspan: self.assertFalse(table[i][j]["invisible"]) diff --git a/tests/unit_tests/test_module_cell_splitter.py b/tests/unit_tests/test_module_cell_splitter.py index ad48952a..36113dbc 100644 --- a/tests/unit_tests/test_module_cell_splitter.py +++ b/tests/unit_tests/test_module_cell_splitter.py @@ -1,5 +1,7 @@ import unittest +from dedocutils.data_structures import BBox + from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter @@ -10,42 +12,42 @@ class TestCellSplitter(unittest.TestCase): def test_merge_close_borders(self) -> None: cells = [ [ - Cell(x_top_left=0, y_top_left=0, x_bottom_right=50, y_bottom_right=30), - Cell(x_top_left=51, y_top_left=2, x_bottom_right=90, y_bottom_right=29) + Cell(BBox(x_top_left=0, y_top_left=0, width=50, height=30)), + Cell(BBox(x_top_left=51, y_top_left=2, width=39, height=27)) ], [ - Cell(x_top_left=0, y_top_left=31, x_bottom_right=50, y_bottom_right=50), - Cell(x_top_left=51, y_top_left=31, x_bottom_right=91, y_bottom_right=50) + Cell(BBox(x_top_left=0, y_top_left=31, width=50, height=19)), + Cell(BBox(x_top_left=51, y_top_left=31, width=40, height=19)) ] ] cells_merged = self.splitter._merge_close_borders(cells) - self.assertEqual(0, cells_merged[0][0].x_top_left) - self.assertEqual(0, cells_merged[0][0].y_top_left) - self.assertEqual(50, cells_merged[0][0].x_bottom_right) - self.assertEqual(29, cells_merged[0][0].y_bottom_right) - - self.assertEqual(50, cells_merged[0][1].x_top_left) - self.assertEqual(0, cells_merged[0][1].y_top_left) - self.assertEqual(90, cells_merged[0][1].x_bottom_right) - self.assertEqual(29, cells_merged[0][1].y_bottom_right) - - self.assertEqual(0, cells_merged[1][0].x_top_left) - self.assertEqual(29, cells_merged[1][0].y_top_left) - self.assertEqual(50, cells_merged[1][0].x_bottom_right) - self.assertEqual(50, cells_merged[1][0].y_bottom_right) - - self.assertEqual(50, cells_merged[1][1].x_top_left) - self.assertEqual(29, cells_merged[1][1].y_top_left) - self.assertEqual(90, cells_merged[1][1].x_bottom_right) - self.assertEqual(50, cells_merged[1][1].y_bottom_right) + self.assertEqual(0, cells_merged[0][0].bbox.x_top_left) + self.assertEqual(0, cells_merged[0][0].bbox.y_top_left) + self.assertEqual(50, cells_merged[0][0].bbox.x_bottom_right) + self.assertEqual(29, cells_merged[0][0].bbox.y_bottom_right) + + self.assertEqual(50, cells_merged[0][1].bbox.x_top_left) + self.assertEqual(0, cells_merged[0][1].bbox.y_top_left) + self.assertEqual(90, cells_merged[0][1].bbox.x_bottom_right) + self.assertEqual(29, cells_merged[0][1].bbox.y_bottom_right) + + self.assertEqual(0, cells_merged[1][0].bbox.x_top_left) + self.assertEqual(29, cells_merged[1][0].bbox.y_top_left) + self.assertEqual(50, cells_merged[1][0].bbox.x_bottom_right) + self.assertEqual(50, cells_merged[1][0].bbox.y_bottom_right) + + self.assertEqual(50, cells_merged[1][1].bbox.x_top_left) + self.assertEqual(29, cells_merged[1][1].bbox.y_top_left) + self.assertEqual(90, cells_merged[1][1].bbox.x_bottom_right) + self.assertEqual(50, cells_merged[1][1].bbox.y_bottom_right) def test_merge_close_borders_one_cell(self) -> None: - cells = [[Cell(x_top_left=0, y_top_left=0, x_bottom_right=50, y_bottom_right=30)]] + cells = [[Cell(BBox(x_top_left=0, y_top_left=0, width=50, height=30))]] cells_merged = self.splitter._merge_close_borders(cells) - self.assertEqual(0, cells_merged[0][0].x_top_left) - self.assertEqual(0, cells_merged[0][0].y_top_left) - self.assertEqual(50, cells_merged[0][0].x_bottom_right) - self.assertEqual(30, cells_merged[0][0].y_bottom_right) + self.assertEqual(0, cells_merged[0][0].bbox.x_top_left) + self.assertEqual(0, cells_merged[0][0].bbox.y_top_left) + self.assertEqual(50, cells_merged[0][0].bbox.x_bottom_right) + self.assertEqual(30, cells_merged[0][0].bbox.y_bottom_right) def test_merge_zero_cells(self) -> None: cells = [[]] @@ -58,24 +60,24 @@ def test_split_zero_cells(self) -> None: self.assertListEqual([[]], matrix) def test_split_one_cell(self) -> None: - cells = [[Cell(x_top_left=0, y_top_left=0, x_bottom_right=10, y_bottom_right=15)]] + cells = [[Cell(BBox(x_top_left=0, y_top_left=0, width=10, height=15))]] matrix = self.splitter.split(cells=cells) self.assertEqual(1, len(matrix)) self.assertEqual(1, len(matrix[0])) new_cell = matrix[0][0] - self.assertEqual(0, new_cell.x_top_left) - self.assertEqual(0, new_cell.y_top_left) - self.assertEqual(10, new_cell.x_bottom_right) - self.assertEqual(15, new_cell.y_bottom_right) + self.assertEqual(0, new_cell.bbox.x_top_left) + self.assertEqual(0, new_cell.bbox.y_top_left) + self.assertEqual(10, new_cell.bbox.x_bottom_right) + self.assertEqual(15, new_cell.bbox.y_bottom_right) def test_horizontal_split(self) -> None: cells = [ [ - Cell(x_top_left=0, y_top_left=0, x_bottom_right=3, y_bottom_right=5), - Cell(x_top_left=3, y_top_left=0, x_bottom_right=7, y_bottom_right=3), + Cell(BBox(x_top_left=0, y_top_left=0, width=3, height=5)), + Cell(BBox(x_top_left=3, y_top_left=0, width=4, height=3)), ], [ - Cell(x_top_left=3, y_top_left=3, x_bottom_right=7, y_bottom_right=5), + Cell(BBox(x_top_left=3, y_top_left=3, width=4, height=2)), ] ] matrix = self.splitter.split(cells) @@ -83,34 +85,34 @@ def test_horizontal_split(self) -> None: self.assertEqual(2, len(matrix[0])) self.assertEqual(2, len(matrix[1])) [cell_a, cell_b], [cell_c, cell_d] = matrix - self.assertEqual(0, cell_a.x_top_left) - self.assertEqual(0, cell_a.y_top_left) - self.assertEqual(3, cell_a.x_bottom_right) - self.assertEqual(3, cell_a.y_bottom_right) - - self.assertEqual(3, cell_b.x_top_left) - self.assertEqual(0, cell_b.y_top_left) - self.assertEqual(7, cell_b.x_bottom_right) - self.assertEqual(3, cell_b.y_bottom_right) - - self.assertEqual(0, cell_c.x_top_left) - self.assertEqual(3, cell_c.y_top_left) - self.assertEqual(3, cell_c.x_bottom_right) - self.assertEqual(5, cell_c.y_bottom_right) - - self.assertEqual(3, cell_d.x_top_left) - self.assertEqual(3, cell_d.y_top_left) - self.assertEqual(7, cell_d.x_bottom_right) - self.assertEqual(5, cell_d.y_bottom_right) + self.assertEqual(0, cell_a.bbox.x_top_left) + self.assertEqual(0, cell_a.bbox.y_top_left) + self.assertEqual(3, cell_a.bbox.x_bottom_right) + self.assertEqual(3, cell_a.bbox.y_bottom_right) + + self.assertEqual(3, cell_b.bbox.x_top_left) + self.assertEqual(0, cell_b.bbox.y_top_left) + self.assertEqual(7, cell_b.bbox.x_bottom_right) + self.assertEqual(3, cell_b.bbox.y_bottom_right) + + self.assertEqual(0, cell_c.bbox.x_top_left) + self.assertEqual(3, cell_c.bbox.y_top_left) + self.assertEqual(3, cell_c.bbox.x_bottom_right) + self.assertEqual(5, cell_c.bbox.y_bottom_right) + + self.assertEqual(3, cell_d.bbox.x_top_left) + self.assertEqual(3, cell_d.bbox.y_top_left) + self.assertEqual(7, cell_d.bbox.x_bottom_right) + self.assertEqual(5, cell_d.bbox.y_bottom_right) def test_vertical_split(self) -> None: cells = [ [ - Cell(x_top_left=0, y_top_left=0, x_bottom_right=8, y_bottom_right=2), + Cell(BBox(x_top_left=0, y_top_left=0, width=8, height=2)), ], [ - Cell(x_top_left=0, y_top_left=2, x_bottom_right=5, y_bottom_right=5), - Cell(x_top_left=5, y_top_left=2, x_bottom_right=8, y_bottom_right=5), + Cell(BBox(x_top_left=0, y_top_left=2, width=5, height=3)), + Cell(BBox(x_top_left=5, y_top_left=2, width=3, height=3)), ] ] matrix = self.splitter.split(cells) @@ -118,35 +120,35 @@ def test_vertical_split(self) -> None: self.assertEqual(2, len(matrix[0])) self.assertEqual(2, len(matrix[1])) [cell_a, cell_b], [cell_c, cell_d] = matrix - self.assertEqual(0, cell_a.x_top_left) - self.assertEqual(0, cell_a.y_top_left) - self.assertEqual(5, cell_a.x_bottom_right) - self.assertEqual(2, cell_a.y_bottom_right) - - self.assertEqual(5, cell_b.x_top_left) - self.assertEqual(0, cell_b.y_top_left) - self.assertEqual(8, cell_b.x_bottom_right) - self.assertEqual(2, cell_b.y_bottom_right) - - self.assertEqual(0, cell_c.x_top_left) - self.assertEqual(2, cell_c.y_top_left) - self.assertEqual(5, cell_c.x_bottom_right) - self.assertEqual(5, cell_c.y_bottom_right) - - self.assertEqual(5, cell_d.x_top_left) - self.assertEqual(2, cell_d.y_top_left) - self.assertEqual(8, cell_d.x_bottom_right) - self.assertEqual(5, cell_d.y_bottom_right) + self.assertEqual(0, cell_a.bbox.x_top_left) + self.assertEqual(0, cell_a.bbox.y_top_left) + self.assertEqual(5, cell_a.bbox.x_bottom_right) + self.assertEqual(2, cell_a.bbox.y_bottom_right) + + self.assertEqual(5, cell_b.bbox.x_top_left) + self.assertEqual(0, cell_b.bbox.y_top_left) + self.assertEqual(8, cell_b.bbox.x_bottom_right) + self.assertEqual(2, cell_b.bbox.y_bottom_right) + + self.assertEqual(0, cell_c.bbox.x_top_left) + self.assertEqual(2, cell_c.bbox.y_top_left) + self.assertEqual(5, cell_c.bbox.x_bottom_right) + self.assertEqual(5, cell_c.bbox.y_bottom_right) + + self.assertEqual(5, cell_d.bbox.x_top_left) + self.assertEqual(2, cell_d.bbox.y_top_left) + self.assertEqual(8, cell_d.bbox.x_bottom_right) + self.assertEqual(5, cell_d.bbox.y_bottom_right) def test_no_split(self) -> None: cells = [ [ - Cell(x_top_left=160, y_top_left=321, x_bottom_right=825, y_bottom_right=369), - Cell(x_top_left=825, y_top_left=321, x_bottom_right=1494, y_bottom_right=369) + Cell(BBox(x_top_left=160, y_top_left=321, width=665, height=48)), + Cell(BBox(x_top_left=825, y_top_left=321, width=669, height=48)) ], [ - Cell(x_top_left=160, y_top_left=374, x_bottom_right=825, y_bottom_right=423), - Cell(x_top_left=825, y_top_left=374, x_bottom_right=1494, y_bottom_right=423) + Cell(BBox(x_top_left=160, y_top_left=374, width=665, height=49)), + Cell(BBox(x_top_left=825, y_top_left=374, width=669, height=49)) ] ] From 75db1a770f18d0959d288c82d83507d2da5a46ff Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Mon, 16 Dec 2024 18:17:04 +0300 Subject: [PATCH 6/8] TLDR-851 fix lint --- .../concrete_extractors/onepage_table_extractor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index f345f1e3..c7c59414 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -65,10 +65,7 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: matrix.append(cpy_line) line.clear() - cell_ = Cell(bbox=cell.cell_box, - id_con=cell.id_contours, - lines=cell.lines, - contour_coord=cell.cell_box) + cell_ = Cell(bbox=cell.cell_box, id_con=cell.id_contours, lines=cell.lines, contour_coord=cell.cell_box) line.append(cell_) matrix.append(line) From 1a02c48a4c71b52c7963f118ac64441fdce9daf3 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Tue, 17 Dec 2024 19:05:09 +0300 Subject: [PATCH 7/8] TLDR-861 fixes after review --- dedoc/data_structures/cell_with_meta.py | 2 +- .../pdf_reader/data_classes/tables/cell.py | 10 +- .../data_classes/tables/scantable.py | 5 +- dedoc/readers/pdf_reader/pdf_base_reader.py | 1 - .../split_last_hor_union_cells.py | 9 +- .../onepage_table_extractor.py | 4 +- .../table_attribute_extractor.py | 51 ++---- .../table_recognizer/table_recognizer.py | 9 +- .../table_utils/accuracy_table_rec.py | 148 ------------------ .../table_recognizer/table_utils/utils.py | 14 ++ .../pdf_txtlayer_reader/pdf_tabby_reader.py | 6 +- docs/source/dedoc_api_usage/api.rst | 19 --- docs/source/parameters/pdf_handling.rst | 24 --- .../unit_tests/test_module_table_detection.py | 15 +- 14 files changed, 54 insertions(+), 263 deletions(-) delete mode 100644 dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index 1ef652b0..03ee0c67 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -48,7 +48,7 @@ def get_annotations(self) -> List[Annotation]: return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations def __str__(self) -> str: - return f"CellWithMeta((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})" + return f"CellWithMeta(cs={self.colspan}, rs={self.rowspan}, {self.get_text()})" def to_api_schema(self) -> ApiCellWithMeta: import numpy as np diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index b2b28bf2..d83e2b6c 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -23,16 +23,15 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) self.bbox.shift(shift_x=shift_x, shift_y=shift_y) - if self.con_coord: - self.con_coord.shift(shift_x=shift_x, shift_y=shift_y) + if self.contour_coord: + self.contour_coord.shift(shift_x=shift_x, shift_y=shift_y) def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, - is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None], + is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None, contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: import uuid - self.lines = [] if lines is None else lines super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible) self.bbox = bbox @@ -40,9 +39,8 @@ def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMe self.is_attribute = is_attribute self.is_attribute_required = is_attribute_required self.rotated_angle = rotated_angle - self.uuid = uuid.uuid4() if uuid is None else uid - self.con_coord = contour_coord or BBox(0, 0, 0, 0) + self.contour_coord = contour_coord or BBox(0, 0, 0, 0) def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None: from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index fa60aaeb..9ae91c18 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -33,8 +33,7 @@ def check_on_cell_instance(self) -> bool: return False return True - @staticmethod - def get_cells_text(cells: List[List[CellWithMeta]]) -> List[List[str]]: + def __get_cells_text(self, cells: List[List[CellWithMeta]]) -> List[List[str]]: return [[cell.get_text() for cell in row] for row in cells] @property @@ -48,7 +47,7 @@ def uid(self) -> str: def to_dict(self) -> dict: from collections import OrderedDict - data_text = ScanTable.get_cells_text(self.cells) + data_text = self.__get_cells_text(self.cells) res = OrderedDict() res["locations"] = [location.to_dict() for location in self.locations] diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 41e2990f..3a6e29ef 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -87,7 +87,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure ) lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse) - # tables = [scan_table.to_table() for scan_table in scan_tables] if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path): attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index e80769e0..8dd0bbac 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -130,8 +130,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image x_left = union_cell[0].bbox.x_top_left + eps x_right = union_cell[-1].bbox.x_bottom_right # get y coordinate from cell before union cell - y_top_split = cell_splitter.con_coord.y_top_left - y_bottom_split = cell_splitter.con_coord.y_top_left + cell_splitter.con_coord.height + y_top_split = cell_splitter.contour_coord.y_top_left + y_bottom_split = cell_splitter.contour_coord.y_top_left + cell_splitter.contour_coord.height if abs(y_bottom_split - y_top_split) < 10: for cell in union_cell: cell.lines = [] @@ -162,9 +162,8 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra for line in list(ocr_result.lines): text_line = OCRCellExtractor.get_line_with_meta("") for word in line.words: - # do absolute coordinate on src_image (inside src_image) - word.bbox.shift(shift_x=-padding_cell_value, shift_y=-padding_cell_value) - word.bbox.shift(shift_x=cell_bbox.x_top_left, shift_y=cell_bbox.y_top_left) + # do absolute coordinates on src_image (inside src_image) + word.bbox.shift(shift_x=cell_bbox.x_top_left - padding_cell_value, shift_y=cell_bbox.y_top_left - padding_cell_value) # add space between words if len(text_line) != 0: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index c7c59414..6271b2ac 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -23,7 +23,7 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None: self.image = None self.page_number = 0 - self.table_header_selector = TableHeaderExtractor(logger=self.logger) + self.table_header_extractor = TableHeaderExtractor(logger=self.logger) self.count_vertical_extended = 0 self.splitter = CellSplitter() self.table_options = TableTypeAdditionalOptions() @@ -108,7 +108,7 @@ def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[Li if self.table_options.split_last_column in table_type: cells = split_last_column(cells, language=self.language, image=self.image) - self.table_header_selector.set_header_cells(cells) + self.table_header_extractor.set_header_cells(cells) if self.config.get("debug_mode", False): self._print_table_attr(cells) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py index e25dbd2d..99420036 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py @@ -68,25 +68,23 @@ def clear_attributes(matrix_table: List[List[Cell]]) -> None: def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, max_raw_of_search: int) -> bool: # № п/п - for i in range(0, max_raw_of_search + 1): + for i in range(max_raw_of_search + 1): if column_id < len(matrix_table[i]) and "№" in matrix_table[i][column_id].get_text() and len( matrix_table[i][column_id].get_text()) < len("№ п/п\n"): return True return False def __set_attributes_for_type_top(self, cells: List[List[Cell]]) -> List[List[Cell]]: - vertical_union_columns = self.__analyze_attr_for_vertical_union_columns(cells) horizontal_union_rows = self.__analyze_attr_for_horizontal_union_raws(cells) - # simple table - if (0 not in horizontal_union_rows) and len(vertical_union_columns) == 0: + if 0 not in horizontal_union_rows: self.__analyze_attr_for_simple_table(cells) return cells def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> bool: all_empty = True - for i in range(0, len(matrix_table)): + for i in range(len(matrix_table)): if len(matrix_table[i]) <= column_id: break if matrix_table[i][column_id].get_text() != "": @@ -96,37 +94,17 @@ def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> b def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool: all_empty = True - for j in range(0, len(matrix_table[row_index])): + for j in range(len(matrix_table[row_index])): if matrix_table[row_index][j].get_text() != "": all_empty = False break return all_empty - def __analyze_attr_for_vertical_union_columns(self, cells: List[List[Cell]]) -> List[int]: - vertical_union_columns = [] - if len(vertical_union_columns) != 0 and len(cells) > 1: - self.logger.debug("ATTR_TYPE: vertical union table") - row_max_attr = 1 - - # Установка атрибутов таблицы - for i in range(0, row_max_attr): - for j in range(0, len(cells[i])): - cells[i][j].is_attribute = True - - # Установка обязательных атрибутов - cells[0][0].is_attribute_required = True - for j in range(1, len(cells[0])): - is_attribute_required = True - if is_attribute_required: - cells[0][j].is_attribute_required = True - - return vertical_union_columns - def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> List[int]: horizontal_union_rows = [] union_first = False - for i in range(0, len(cells)): + for i in range(len(cells)): if len(horizontal_union_rows) > 0 and i not in horizontal_union_rows: horizontal_union_rows.append(i) if not self.__is_empty_row(cells, i): @@ -134,8 +112,8 @@ def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> L if union_first and len(horizontal_union_rows) != 0: self.logger.debug("ATTR_TYPE: horizontal_union_rows") - for i in range(0, len(horizontal_union_rows)): - for j in range(0, len(cells[i])): + for i in range(len(horizontal_union_rows)): + for j in range(len(cells[i])): cells[i][j].is_attribute = True cells[0][0].is_attribute_required = True first_required_column = 0 @@ -160,20 +138,19 @@ def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> L def __analyze_attr_for_simple_table(self, cells: List[List[Cell]]) -> None: self.logger.debug("ATTR_TYPE: simple table") - for j in range(0, len(cells[0])): - cells[0][j].is_attribute = True + for cell in cells[0]: + cell.is_attribute = True + # set first required column - j = 0 - first_required_column = j - while j < len(cells[0]): + first_required_column = 0 + for j in range(len(cells[0])): if not self.__is_empty_column(cells, j): cells[0][j].is_attribute_required = True first_required_column = j break - j += 1 # search indexable_column - # один один столбец должен быть (0) - нумерованным, - # один (1) - с обязательными поляями, один (2) - с необязательными + # один столбец должен быть (0) - нумерованным, + # один (1) - с обязательными полями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 if self.__is_indexable_column(cells, first_required_column, 0) and len(cells) > first_required_column + 2: cells[0][first_required_column + 1].is_attribute_required = True diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index eb07732d..11c30cab 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -21,9 +21,7 @@ class TableRecognizer(object): def __init__(self, *, config: dict = None) -> None: - self.logger = config.get("logger", logging.getLogger()) - self.onepage_tables_extractor = OnePageTableExtractor(config=config, logger=self.logger) self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger) self.config = config @@ -109,11 +107,8 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool: std = table_image.std() white_mean = (table_image > 225).mean() black_mean = (table_image < 225).mean() - table_area = bbox.width * bbox.height - cells_area = 0 - for row in table.cells: - for cell in row: - cells_area += cell.bbox.width * cell.bbox.height + table_area = bbox.square + cells_area = sum([cell.bbox.square for row in table.cells for cell in row]) ratio = cells_area / table_area res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or ratio < 0.65 diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py deleted file mode 100644 index c98d71a5..00000000 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py +++ /dev/null @@ -1,148 +0,0 @@ -import csv -import json -import os -from typing import List, Tuple - -import cv2 -from dedocutils.data_structures import BBox - -from dedoc.config import get_config -from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell -from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable -from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader - - -def _create_cell(c: str, text_cells: list) -> Cell: - cell = Cell(BBox(x_top_left=-1, y_top_left=-1, width=0, height=0)) - if "a" in c: - cell.is_attribute = True - # loading cell text - if len(text_cells) != 0: - cell_text = [r for r in text_cells if r[0] == c] - if len(cell_text) != 0: - cell.text = cell_text[0][-1] - return cell - - -def load_from_csv(path_csv: str, path_class_2_csv: str = "") -> List[List[Cell]]: - text_cells = [] - if path_class_2_csv != "": - csv_file_class_2 = open(path_class_2_csv, "r", newline="") - reader_class_2 = csv.reader(csv_file_class_2) - text_cells = [r for r in reader_class_2] - - matrix = [] - with open(path_csv, "r", newline="") as csv_file: - reader = csv.reader(csv_file) - - for raw in reader: - if len(raw) >= 5 and raw[0] == "bbox": - pass - else: - line = [_create_cell(c, text_cells) for c in raw if c != ""] - if len(line) != 0: - matrix.append(line) - return matrix - - -def get_quantitative_parameters(matrix: List[List[Cell]]) -> Tuple[int, int, int, int]: - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = 0, 0, 0, 0 - - # calculating data - if len(matrix) > 0: - cnt_columns = len(matrix[0]) - cnt_rows = len(matrix) - - for i in range(0, len(matrix)): - for j in range(0, len(matrix[i])): - if matrix[i][j].is_attribute: - cnt_a_cell += 1 - - cnt_cell += 1 - - return cnt_a_cell, cnt_cell, cnt_columns, cnt_rows - - -def calc_agreement(matrix_gt: List[List[Cell]], matrix: List[List[Cell]]) -> float: - q_params = get_quantitative_parameters(matrix) - q_params_gt = get_quantitative_parameters(matrix_gt) - - equal_indexes = [i for i in range(0, len(q_params)) if q_params[i] == q_params_gt[i]] - - agreement = 1.0 * len(equal_indexes) / len(q_params_gt) - return agreement - - -def draw_recognized_cell(tables: List[ScanTable], path_image: str, path_save: str) -> None: - img = cv2.imread(path_image) - for t_index in range(0, len(tables)): - table = tables[t_index].cells - bbox = tables[t_index].locations.location - blue_color, green_color, red_color = (255, 0, 0), (0, 255, 0), (0, 0, 255) - cv2.rectangle(img, (bbox.x_top_left, bbox.y_top_left), (bbox.width, bbox.height), blue_color, 6) - for i in range(0, len(table)): - for j in range(0, len(table[i])): - cv2.rectangle(img, - (table[i][j].bbox.x_top_left, table[i][j].bbox.y_top_left), - (table[i][j].bbox.x_bottom_right, table[i][j].bbox.y_bottom_right), - red_color, 4 - ) - cv2.putText(img, str(table[i][j].id_con), - (table[i][j].bbox.x_top_left, table[i][j].bbox.y_bottom_right), - cv2.FONT_HERSHEY_PLAIN, 4, green_color - ) - cv2.imwrite(path_save, img) - - -def save_json(tables: List[ScanTable], number_test_string: str, path_output: str) -> None: - for i in range(0, len(tables)): - with open(f"{path_output}{number_test_string}_table_{i}.json", "w") as out: - json.dump(tables[i].to_dict(), out, ensure_ascii=False, indent=2) - - -def calc_accuracy(path_image: str, path_gt_struct: str, path_gt_text: str, path_save_image: str, path_save_json: str) -> None: - from os import listdir - from os.path import isfile, join - - os.makedirs(path_save_image, exist_ok=True) - os.makedirs(path_save_json, exist_ok=True) - - image_files = [f for f in listdir(path_image) if isfile(join(path_image, f))] - agreements = [] - - for image_file in image_files: - name_example = image_file.split(".")[0].split("_")[0] - # predict tables - image = cv2.imread(path_image + image_file, 0) - # TODO fix this - clean_images, tables = PdfImageReader(config=get_config()).get_tables([image]) - draw_recognized_cell(tables, path_image + image_file, path_save_image + image_file) - save_json(tables, name_example, path_save_json) - - gt_files = [f for f in listdir(path_gt_struct) if isfile(join(path_gt_struct, f)) and name_example + "_" in f] - for index_table in range(0, len(gt_files)): - - csv_filename = path_gt_struct + name_example + "_" + str(index_table + 1) + ".csv" - csv_text_filename = path_gt_text + name_example + "_" + str(index_table + 1) + "_text.csv" - if os.path.exists(csv_filename): - if not os.path.exists(csv_text_filename): - csv_text_filename = "" - # load_GT - matrix_cell_gt = load_from_csv(csv_filename, csv_text_filename) - # calc agreement - if len(tables) == 0 and matrix_cell_gt == []: - agreements.append(1.0) - elif len(tables) <= index_table: - agreements.append(0) - else: - agreement = calc_agreement(matrix_cell_gt, tables[index_table].cells) - agreements.append(agreement) - - -if __name__ == "__main__": - current_path = os.path.dirname(__file__) + "/" - calc_accuracy(current_path + "../../backend/test_dataset_table/images/", - current_path + "../../backend/test_dataset_table/GT_struct/", - current_path + "../../backend/test_dataset_table/GT_text/", - "/tmp/backend_claw/out_tables/acc/draw_tables/", - "/tmp/backend_claw/out_tables/acc/json_tables/") diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py index 80ac01e7..693b8417 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py @@ -1,5 +1,9 @@ +from typing import List, Tuple + import numpy as np +from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell + def equal_with_eps(x: int, y: int, eps: int = 10) -> bool: return y + eps >= x >= y - eps @@ -24,3 +28,13 @@ def similarity(s1: str, s2: str) -> float: normalized2 = s2.lower() matcher = difflib.SequenceMatcher(None, normalized1, normalized2) return matcher.ratio() + + +def get_statistic_values(cells: List[List[Cell]]) -> Tuple[int, int, int, int]: + + cnt_rows = len(cells) + cnt_columns = len(cells[0]) if cnt_rows else 0 + cnt_cell = cnt_columns * cnt_rows + cnt_attr_cell = len([cell for row in cells for cell in row if cell.is_attribute]) + + return cnt_attr_cell, cnt_cell, cnt_columns, cnt_rows diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index cce14d01..b60cbed7 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -14,8 +14,6 @@ from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor class PdfTabbyReader(PdfBaseReader): @@ -31,6 +29,10 @@ class PdfTabbyReader(PdfBaseReader): def __init__(self, *, config: Optional[dict] = None) -> None: import os from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import \ + OnePageTableExtractor + from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import \ + TableHeaderExtractor super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) self.tabby_java_version = "2.0.0" diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index c357ac78..59310477 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -162,25 +162,6 @@ Api parameters description If the document has a textual layer, it is recommended to use ``pdf_with_text_layer=tabby``, in this case tables will be parsed much easier and faster. - * - orient_analysis_cells - - true, false - - false - - This option is used for a table recognition in case of PDF documents without a textual layer - (images, scanned documents or when ``pdf_with_text_layer`` is ``true``, ``false`` or ``auto``). - When set to ``true``, it enables analysis of rotated cells in table headers. - Use this option if you are sure that the cells of the table header are rotated. - - * - orient_cell_angle - - 90, 270 - - 90 - - This option is used for a table recognition in case of PDF documents without a textual layer - (images, scanned documents or when ``pdf_with_text_layer`` is ``true``, ``false`` or ``auto``). - It is ignored when ``orient_analysis_cells=false``. - The option is used to set orientation of cells in table headers: - - * **270** -- cells are rotated 90 degrees clockwise; - * **90** -- cells are rotated 90 degrees counterclockwise (or 270 clockwise). - * - :cspan:`3` **PDF handling** * - pdf_with_text_layer diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 20fabec9..46c03416 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -161,30 +161,6 @@ PDF and images handling It allows :class:`dedoc.readers.PdfImageReader`, :class:`dedoc.readers.PdfTxtlayerReader` and :class:`dedoc.readers.PdfTabbyReader` to properly process the content of the document containing GOST frame, see :ref:`gost_frame_handling` for more details. - * - orient_analysis_cells - - True, False - - False - - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` - * :meth:`dedoc.readers.ReaderComposition.read` - - This option is used for a table recognition for PDF documents or images. - It is ignored when ``need_pdf_table_analysis=False``. - When set to ``True``, it enables analysis of rotated cells in table headers. - Use this option if you are sure that the cells of the table header are rotated. - - * - orient_cell_angle - - 90, 270 - - 90 - - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` - * :meth:`dedoc.readers.ReaderComposition.read` - - This option is used for a table recognition for PDF documents or images. - It is ignored when ``need_pdf_table_analysis=False`` or ``orient_analysis_cells=False``. - The option is used to set orientation of cells in table headers: - - * **270** -- cells are rotated 90 degrees clockwise; - * **90** -- cells are rotated 90 degrees counterclockwise (or 270 clockwise). - .. toctree:: :maxdepth: 1 diff --git a/tests/unit_tests/test_module_table_detection.py b/tests/unit_tests/test_module_table_detection.py index 39b1b4dc..29d2e8da 100644 --- a/tests/unit_tests/test_module_table_detection.py +++ b/tests/unit_tests/test_module_table_detection.py @@ -7,13 +7,12 @@ from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.accuracy_table_rec import get_quantitative_parameters -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps, similarity as utils_similarity +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps, get_statistic_values, similarity as sim from tests.test_utils import get_full_path, get_test_config def similarity(s1: str, s2: str, threshold: float = 0.8) -> bool: - return True if utils_similarity(s1, s2) > threshold else False + return True if sim(s1, s2) > threshold else False class TestRecognizedTable(unittest.TestCase): @@ -110,7 +109,7 @@ def test_table_recognition_1(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table3.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 8) self.assertEqual(cnt_columns, 3) @@ -125,7 +124,7 @@ def test_table_recognition_2(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table4.jpg"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 5) self.assertEqual(cnt_columns, 3) @@ -140,7 +139,7 @@ def test_table_recognition_3(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 13) self.assertEqual(cnt_columns, 3) @@ -155,7 +154,7 @@ def test_table_recognition_4(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 13) self.assertEqual(cnt_columns, 3) @@ -170,7 +169,7 @@ def test_table_recognition_with_rotate_5(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table6.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 3) self.assertEqual(cnt_columns, 7) From ee9175916520d27e252fe760ebb952592d37838b Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Fri, 20 Dec 2024 15:15:09 +0300 Subject: [PATCH 8/8] TLDR-861 fixes after review --- dedoc/api/web/index.html | 25 ++++++--------- .../onepage_table_extractor.py | 4 +-- .../table_attribute_extractor.py | 32 ++++++++----------- docs/source/dedoc_api_usage/api.rst | 3 +- .../test_api_misc_multipage_table.py | 9 +++--- 5 files changed, 31 insertions(+), 42 deletions(-) diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index c68963b6..5538878a 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -98,23 +98,9 @@

Attachments handling

- -
-

Tables handling

-
need_pdf_table_analysis -
-

- -

-
-
- -

PDF handling

-
pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis +
pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis

PDF handling

+
need_pdf_table_analysis +
+

+ +

+
+

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index 6271b2ac..c676b3da 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -70,8 +70,8 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: matrix.append(line) # sorting column in each row - for i in range(0, len(matrix)): - matrix[i] = sorted(matrix[i], key=lambda cell: cell.bbox.x_top_left, reverse=False) + for i, row in enumerate(matrix): + matrix[i] = sorted(row, key=lambda cell: cell.bbox.x_top_left, reverse=False) matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py index 99420036..3dfca0e1 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py @@ -66,11 +66,10 @@ def clear_attributes(matrix_table: List[List[Cell]]) -> None: cell.is_attribute = False cell.is_attribute_required = False - def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, max_raw_of_search: int) -> bool: + def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, max_row_of_search: int) -> bool: # № п/п - for i in range(max_raw_of_search + 1): - if column_id < len(matrix_table[i]) and "№" in matrix_table[i][column_id].get_text() and len( - matrix_table[i][column_id].get_text()) < len("№ п/п\n"): + for row in matrix_table[:max_row_of_search + 1]: + if column_id < len(row) and "№" in row[column_id].get_text() and len(row[column_id].get_text()) < len("№ п/п\n"): return True return False @@ -83,22 +82,19 @@ def __set_attributes_for_type_top(self, cells: List[List[Cell]]) -> List[List[Ce return cells def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> bool: - all_empty = True - for i in range(len(matrix_table)): - if len(matrix_table[i]) <= column_id: - break - if matrix_table[i][column_id].get_text() != "": - all_empty = False - break - return all_empty + for row in matrix_table: + if len(row) <= column_id: + return True + if row[column_id].get_text() != "": + return False + return True def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool: - all_empty = True - for j in range(len(matrix_table[row_index])): - if matrix_table[row_index][j].get_text() != "": - all_empty = False - break - return all_empty + + for cell in matrix_table[row_index]: + if cell.get_text() != "": + return False + return True def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> List[int]: horizontal_union_rows = [] diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 59310477..c61a6e01 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -150,7 +150,7 @@ Api parameters description The encoded contents will be saved in the attachment's metadata in the ``base64_encode`` field. Use ``true`` value to enable this behaviour. - * - :cspan:`3` **Tables handling** + * - :cspan:`3` **PDF handling** * - need_pdf_table_analysis - true, false @@ -162,7 +162,6 @@ Api parameters description If the document has a textual layer, it is recommended to use ``pdf_with_text_layer=tabby``, in this case tables will be parsed much easier and faster. - * - :cspan:`3` **PDF handling** * - pdf_with_text_layer - true, false, tabby, auto, auto_tabby diff --git a/tests/api_tests/test_api_misc_multipage_table.py b/tests/api_tests/test_api_misc_multipage_table.py index c7431247..ef64fb09 100644 --- a/tests/api_tests/test_api_misc_multipage_table.py +++ b/tests/api_tests/test_api_misc_multipage_table.py @@ -1,4 +1,5 @@ import os +import unittest from typing import List from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -45,6 +46,7 @@ def test_api_ml_table_recognition_synthetic_data_1(self) -> None: tables = self._get_tables(file_name, pdf_with_text_layer=pdf_param) self.assertEqual(len(tables), 1) + @unittest.skip("TLDR-886 подправить координаты ячеек таблиц табби") def test_api_ml_table_recognition_synthetic_data_3(self) -> None: file_name = "example_mp_table_with_repeate_header_2.pdf" for pdf_param in ["false", "true", "tabby"]: @@ -65,8 +67,5 @@ def test_api_ml_table_recognition_synthetic_data_3(self) -> None: self.assertListEqual(["Данные 3", "Данные 3", "Данные 3", "Данные 3", "Данные 3"], self._get_text_of_row(table[5])) self.assertListEqual(["Данные 4", "Данные 4", "Данные 4", "Данные 4", "Данные 4"], self._get_text_of_row(table[6])) self.assertListEqual(["Данные 5", "Данные 5", "Данные 5", "Данные 5", "Данные 5"], self._get_text_of_row(table[7])) - self.assertListEqual(["Заголовок\nБольшой", "Заголовок поменьше 1", "Заголовок поменьше 1", "Заголовок поменьше 2", "Заголовок поменьше 2"], - self._get_text_of_row(table[8])) - self.assertListEqual(["Заголовок\nБольшой", "Заголовочек 1", "Заголовочек 2", "Заголовочек 3", "Заголовочек 4"], self._get_text_of_row(table[9])) - self.assertListEqual(["Данные 6", "Данные 6", "Данные 6", "Данные 6", "Данные 6"], self._get_text_of_row(table[10])) - self.assertListEqual(["Данные 7", "Данные 7", "Данные 7", "Данные 7", "Данные 7"], self._get_text_of_row(table[11])) + self.assertListEqual(["Данные 6", "Данные 6", "Данные 6", "Данные 6", "Данные 6"], self._get_text_of_row(table[8])) + self.assertListEqual(["Данные 7", "Данные 7", "Данные 7", "Данные 7", "Данные 7"], self._get_text_of_row(table[9]))