From 10b9331149e46e99018f9379cd84ba31146d6d13 Mon Sep 17 00:00:00 2001 From: Yurii Havrylko Date: Wed, 3 Jan 2024 23:49:39 +0100 Subject: [PATCH 01/10] minio setup options --- README.md | 26 ++++++++++++++++++++++++++ deployment/minio.yml | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 deployment/minio.yml diff --git a/README.md b/README.md index a67ba1f..4ded965 100644 --- a/README.md +++ b/README.md @@ -22,3 +22,29 @@ DH Images: Works on push to master/feature* ![Alt text](assets/actions.png) + +### Minio setup +Mac/Local +``` +brew install minio/stable/minio + +minio server --console-address :9001 ~/minio # path to persistent local storage + run on custom port +``` + +Docker + +``` +docker run \ + -p 9002:9002 \ + --name minio \ + -v ~/minio:/data \ + -e "MINIO_ROOT_USER=ROOTNAME" \ + -e "MINIO_ROOT_PASSWORD=CHANGEME123" \ + quay.io/minio/minio server /data --console-address ":9002" +``` + +Kubernetes + +``` +kubectl create -f deployment/minio.yml +``` diff --git a/deployment/minio.yml b/deployment/minio.yml new file mode 100644 index 0000000..7de1ec6 --- /dev/null +++ b/deployment/minio.yml @@ -0,0 +1,38 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: minio-deployment +spec: + selector: + matchLabels: + app: minio + strategy: + type: Recreate + template: + metadata: + labels: + # Label is used as selector in the service. 
+ app: minio + spec: + volumes: + - name: storage + persistentVolumeClaim: + claimName: minio-pv-claim + containers: + - name: minio + image: quay.io/minio/minio:latest + args: + - server + - /storage + env: + # Minio access key and secret key + - name: MINIO_ACCESS_KEY + value: "minio" + - name: MINIO_SECRET_KEY + value: "minio123" + ports: + - containerPort: 9003 + hostPort: 9003 + volumeMounts: + - name: storage + mountPath: "/storage" From 1fe56f59a69ce9386545ca9f4d9878ecd9205fcc Mon Sep 17 00:00:00 2001 From: Yurii Havrylko Date: Thu, 4 Jan 2024 23:18:30 +0100 Subject: [PATCH 02/10] added labeling studio and labeled ~30 samples --- README.md | 9 +++++++++ assets/labeling.png | Bin 0 -> 28647 bytes 2 files changed, 9 insertions(+) create mode 100644 assets/labeling.png diff --git a/README.md b/README.md index a67ba1f..e43e086 100644 --- a/README.md +++ b/README.md @@ -22,3 +22,12 @@ DH Images: Works on push to master/feature* ![Alt text](assets/actions.png) +### Label studio + +``` +docker pull heartexlabs/label-studio:latest +docker run -it -p 8080:8080 -v `pwd`/mydata:/label-studio/data heartexlabs/label-studio:latest +``` + +![Alt text](assets/labeling.png) + diff --git a/assets/labeling.png b/assets/labeling.png new file mode 100644 index 0000000000000000000000000000000000000000..294c9a0c5cd883e790fa1c63a95b567e3840d9fa GIT binary patch literal 28647 zcmaI71z1#Vw=leC7?6-q8i|nxmF^srR4GA9y1P4v5EP^YL>fd1DM@KiP`bONySu-Q z`aTcuIp_bmu3JSUjm$V$}vQnUkNPO+SUkz(8Bjc+6x4?PX+lV2rd5>K$hsC5dou1&wePi^+ClG;ZWBtXvQOT#w5z58+P{d;MkP51P0=K!rF zaQ<`p!_}1Y%TiN}05?w{g5*^n%ORLS{mLsFpo#oaFEUnU4_ch<(v+rDa0BpR0fWnv-|^p(=x zvX?y1UJ;&Xn@uenp2>1rNk=uck$nGL!5E_Pg-<@dBjLa&W7a1&(egMI*Fr6uTHQ5Q zzmv`8Yf*+cBOBoirHqC$SK}yN9Whq)0bV*mq>zx9Q8?N5E+3o87C%bAdOcPcau|I^ zLB?5m4oG>&f;0c{rYgDyW9Q<}N4dD$)Oy;|o9e2 z=hQLQAm^&ocU=%v>BO&RqTm?_OLe&18^8BBg-Y5gnvKYgEb3bf=G~I?;m;dwFzj*_ zZWDd%ZNls|N-IP;pyPM*-~Cbcs&PCE;>o5%K$YK$JJ`6$G#ljbP2=Tl=kKXuY&M#6 
zzcgXrf8h;`7qX<$-nB+7)^Le3s&Er!N*5cTt=T!~IluSg4H|w>0Jqk?pZp$|_~T)kh=9yd$qoLf(VSp2g0SoTXF;Vn63yk4kX^jV?s_o zYMBul-yuE)*hPRe`cB|aR3?v?slZN3klMh&t&jISu%c;->hWJWAYjdbhxYS~EfHVE zw@5foZC>7GDBE;p~{fztk-q?s>)>7m2##}Hx#yqj~p~bI(k%|Mg&t2 zO)W|hT8t{x@;?$5Jz4aO^@3Q^c$YL1=>>gp7-m|7I^WDey}w4Q62xuLMCfYjB&?F3 zFJ@EN>ROZy<<>dSInkM->~;M8;SWJF+YITp2aj&DJpy-(ny|9;t{#z_x+^63oDJmc ztZ)1!44Bje=o^gn=+sD6UavQ-Ogq#Dml_BU-$;DLX-l+$u4>DdjNIMU^dzcD{C z>`{D4En<3z3#n~y)wOo&lr|#;w$5`<3lvE zG=k&Bt&%n3My*Tn%~ku8Kkr9zx2PCW*>kBQk|nhzC*|@Pbm(2)o-%0=+zF4T3Hn@4>+yt)RqA9$ zZAOpjUXS%~>V)o90a-ceUB=bi%SvFx_OnRxf#n(a9JQl0BWu?xq$`BsD2R!S2 zqZBva=zaUecJ$`>&H5Y8H*ypo!a{LW8I&@WO$y{f{?px3IBinD#>-uGZ*m#854WFqC3b0Zs4oAq2Wf<-A7S+6d+Ok? z+Pl`KHhhD&pQq^?I0JBu8ij~@8Z|)A@U&8vG~*&ezi%LeK{ln(QQcC7$ATi~^?-%3A(1sb@0j_Z{-BMu zL9(t$muKCv7gq4gSc%FX<{^?H+E0|)*xzy9&v~$^7MWL_E3RUZ*Qs7pTvjMoq;0BY zF=3Q$(@@G(lwDR~@}wC1OGjy5NsX~o$!<|w(U|FwdH876Sjbx2I;&tR4keC$P$-U7 z14ctl(33z_oVsOcCqqFZJ{3U`M|}YafeweyPVVgA?ZsPP5z=&q9?h(6B(7Ia9B)Xk zN2W`4-cc9bi3zE}n))<8AAhfdyWgqIB5Cyr?+xCxG5W7BHf|oWZ(vQ_nGp4Gb#E3r zaC7v~aL;g8ah=@j-45Q?m~om+_(d@%Ja+p2@CFrbif4+yh|jUWiShBq!8(}{=VLze z(<~fO2hrpyl{#micRrsli7pR-Vn7>VRG*FrL5TedtIxC)bf)*fJ(Y-Xh`7Hb-7Oe~ zYeJuSmjnw3iv`0PqX$O@eFSd@-Y>*OQzB5p+b(SBuJ(xUt`N1S#F1=1du&(U%t6=z zxv+zy5{#RgMgLJ_y7Z~Ey5TvAWBU?&&d>3XqTsjU1Csux)_b3qPT6_bE7B2%u zUx*yYtThjl7B0?^*hS4cH`RpV2I~;{@P6YFOYVN370GwEka=6{nQd?D#{)Z>pB>~b zOe1XSiJGwwAIm%)iR|SSF+7XkPQo8&AdoL-?=jY&Zp4bZ$zUWeAfj%-VY8E_{LuW_ z%0|C@wW&5;0c&W2O5|YM;?~ek3KkSK2X*41==9_1KF7Mfm}M08LMpjo*Zo8GVhj|ZW^me&U%|bGTA7DN>F%l3- z60NZd9C=g0N!e-GH7yDnhPMy=wb}&<4&ZIk|Ble*@-mOfj zW0BPvEowIy$(V1Pe>=~s0?QpPeP&_Q(;D{NKxH7;IFCjB+_+jRvrE+cE_bh$8A621#p{hKn)}voXTpgAd!e2DLu(rY;(@-+g zQZEZF&(j@?T7Ny(bw0EIWB{ju;*+4rX7rln6k%0Pc5Crx^RGP{Mgyj$)s-Gy0m6A6 z{)dC>4&z^)L>+vVA+`wbUn2$Cb(4G~c3-w5hSI!(@^Dni^~jmB#mqAX_6CE^$5!92 zj)sMajH+$E@R{SyoN}h*6jbrnJUlAGkH*K6^AuKhEGiHi?RH?ERhF8a`&pIQlvF#& zRhB*fv@veil{TAZ&` z*&IA6Q?z2phiURVaUZRWSd3f5=1XbWR7Kj(4t8Mhj@e*XJ56-fp6|$gOS-6X*toS3 
zGd@(Ub5Oq|Hc#d3vu1N>Dtf_oq_>-1F6AP4MA1dHD16ah@e5;JW<;qk^SKs_R#WAp zsuwAtK7+@aC)IVCNAh{1_q2y=`OZ{->2%c0x(n*GYI}Q?&Xs;!O|luE{RoTZlWiL;$xJ06dIKMV@cwsO*Gg!8%7r2xMg? z&cL_g_#Mq`MsZi1;hB$!yd>88a0#b{GnNgi7pV$PD57PTP_}9JQr2^!T<}5>Y@{Y* zEGr8zgU{#y3_=E=f=>|eBLcbe&$A?i@y{&;fC4~}836k`Mh^VG`iTHPS9AV+qr43S zZh(K?20yN;(Ep4^0a8)^^9*?m!T>R)M=~f5X-2niI z=&n8xnP;?H;QE7R%4&{kvQGsJY^+%H3~ls{SX`}auhs#CTm`^KDd0^sx2YgTIb?;(yB!qjTA3h+lZ_C|0X7B&_(Y7tC094=&UXe{tdT=MU9@GoI% zQ%6Tz0ajKQ7Z(;6P8J(`6IOP9etuRq4pt712jGYY4sOo|>+0#-I5`SaQ(tZL&(9w`ja<$C-O1YFFIgagtXB|Lb{00)f7S-43SGSw zP%v{fvQ!s0vjTYr?jgd-$tLuB{QrXdyW_Q)YX8pU<>9+N^BUxT&V24*WdF#<3f$6B z_9oqQemccKvM73J4B@rTX7--3J=!4zWsM`j|Jv0Y;9pcu)`#O0O2Z&1m8|9k>} zGlHM1Z}7uVK?k(^gWoS@#Kn|dA!|t((G(KegoK(in9P`2FoGT!q2g<-qH+p}*+9mbz|_>NSLl+Ug24jW`Yunnv*ny* z4?PM@+$UFu&IkCTCe6=3D;3yGy!fgSi9I_zn=xU12MU7&5Of3pJ-@qE;B$VD#q{Fr z_?#)?+12Y@n(nvYt^Ep&hXi0JPXI`2T{ROn6i-b)m79l$i2byV65Zqq1_yk-@esh$ z4Wm}@uOjqDGJpx9k*6}~up#zmHk=I`h6iLonAINwP~ltPSPcBgznjnBMgWYl3>Dze z8hq0c)cRY+mo+Ff^nZ{`y$MAJj1*{b(Gj{FFg2JEsrg6Wt1ThuPa^?nJptq`ITY_b zm#-R1YQHIjz9S;wFG}CR9WiNWATYSXZOBvPW2g+zUkEh_3GSw50U`7tr1hhW_Fo7k z1{~N!4>rX^B9>v@Ky+C2&AWeCGyy^!O~h}4Xt-YyxQfyIg>-lipjZ9FAp{WwhP=-_ z?hmp`48TT+GAg0{p#Tm?0(ijmZ7AN-9mpWChs}-pmn(z#2tcjNj}sk{S_kg77t8zU zFGL?Cx*8o1S)YpD2t^P`L1ehz0#F1^Rg&bR)Y~vN$Y*h01&rS~n391g zWbE^;zeG5p>q3>2YE$Wd)71?Jm&0DMjK}m1zV(V5;DDHyY7pj-4g6(fe=NA7*Xg@= zVyP=vE4t$UwJSj6=SYQx9k7(#q z?I-xHC+6ayo=7IiD8bnbK{f~GHe~%lyWbz&MEE391))oVp{xy*FOIgguf()<94$>y z9G96`Tk=e7qH|g_)d>8&cVgQ;Pn^fP{hRr^eBfp`r9>5J%-|`+Y$)W3|PBCFrZB-`rzk0=XCUU6L9U6AW64 z4?Nc8_;kN@5b-pdgEsKD_P2s6oMkzrz>eOZjF>-!FbbfEE>S)LNE~q$L_neWPJ=qY zMt&~-;I4ucH0BT*04OvKh4K86HWb(dS7BoWx1o!YLQ&vQC<3(zuAl_I;9ggYNAjrXEO?dSuCPH~en=Q*mj45Vs?xg)J z+4MfFT3`=%doyAbtA(ac*cynSyD|?t^S4T`pSOi^%(@h`xW(5B7qu)HhoFSZP9@F^ zSS_ZQ{@kp|$<#D%A8Bnruj_LU795ZUxe;D5_x8zKOY5q2p5h-wFKCC(`P_!8CXGan zzTwhKoGZ$i^&O~IFbhlNtmZO5j-hRSqC@wu*MBrj; z^WBCKt4AFF`CeZ~MuPR|1M^SGbu)woY#_rHqXf=Rr!ILM7CUA0U8cNT(k~qK=38SH 
z%;Y`dtLD&spMrvpQ?s|h2?4ey4{L`KZNFnwtlQ#J?xo)NK{b3r!M9O9+dlZhW}Omy z&-L>BFaZ^d==}@p3cCEJ!`egP;>(Lu+!8qg9A7c~tTVa#Lrsr3^%{G(p$X)RJMA3B zhkVWLk$sX~78|5@wh@c!P zk)6l|)w+}C=4e_y1+^#UyQKwQ4`kfB#csq&gb>jU`3o)60=XKct?enIqR)jf#8b}Z z7?cjnkqBu30;laguC#_ChDQ%;HoX?sZG5WY_#KRovI>TJ7p#jrcx2)}#|FeR2ez?P z-S638)hzE8T2T!Dtk0dTgpIE3jg4sL)5`1kvjl-P~l?#&R_@l_mFba{0yHB@1yMFFWNFRAR~&pi&SrnFvu zR=R=SA6Wkd6W}Pk+0el|Wx2OahGqtp6FHvSa@Hv*B@)-@n7TOlYFANrv0u;u>d4@b zz@El?`e4Hk*;~nn)lK9+XWKRObd)emFq$z2w2{D&`}HfvP{)ho#e_#V{DxjA>m$V} z=avrric&v#(#LG-YAOy!Ejs37btuQ0E~jyQq-U(lrvn%2F3)YuDSonH0VYXK@A!9` zX=V@BYYrz1*K-F&@IdFWgMKp|fbUXH33X?t?FSHRyXsj!TyD48I|H?<*6pM(WkYKH zv?2hRlV8~?bT*BflA5>fVI)BC3h0L5gg>TJI1a7-&pMvusLY+ZfB+4(9rnA>Rz$2l@-;8pG@A19T|F6h;RCwJa` zeJWkxKJ3d{BudY+5|Dn5Cm&;01IWJWTmP9bb+({Pc|cQPaC?=MAwO#6lV}d)mG^dt z&c_{ex04~2Vd@_<@#bj}Q;-pYyQelcht;ola0A_Sx(B?A0`RF{=L2&h5F#d(@0X2}7 zH)i!!i#g>Bxz-qI@ls#Ph2yO^jV}>2=>6)z7w60D!fz?LOM5+^*=Y!X+^S_|ZS~nV z8_Ou?45G&}3srrx00{*}D8v1oorBFM(#@EIN%jkkAt{4NRNhC^GDK0gfHn8Pm<7F6U}IkbxPtyR(xyvO$dV2xP#5AV*Xv^^EP8l%IwF(A!1lS zujicIs$*Ri8UmWez0IUd#Hlzv_T8xu1Ei;0sz`J}x?%NUGN3D~LM^N7x%7z2UXfQ{)n9}jEmpt`(z8Yt;^z7#p?e(Gs9vO;> z*ZX17t+QYdu-^N@>dk(k#dkRFK-1BD`gHH9Slt~ms{KU^>#G3GaF{2PdyjsQQjIf( zmX$pcktUOya=vDw_^#fNLXCEr-xrCV3Sl&TNPr3VcMr|@3^{6+5_x-*k}P7)Z!kK$ zYwGgiaCq$}Rc`LO(dPW)tAPgc2|l2!wB;kYm@+aRE0w4SDTlb|Qb);~HTA ztx87fo5C{c-V+VGL)8o170#K1{tC>vBVG$HIbeXvvU21S*(z{f*f0Ir(P!lV2z~;-?*|%Xt!Ni^$0ZD&&47a zG4M0&R?baDO<94X%|!abvu+u{b!=?%-N%9UO0D~A46>cvh4Zm#Q<%g!!h79waih_P zTAyU-MZ;~Wvxy<~mAP{Bk*e>UERq_FH*&R}^FM^sBhgaR$dKBTKC3MZq(WS}W|9rK zpajIY?wPVp7)HrHb2Hn{@)HR@|HgKfwzd1B$WVqkzdwT|nh7Wuim5a9`-pk4N|OBK z-j+YrSv}p8uyZ0nj(s@1%`rSln)2AJ$>zunDm|*$PmZme>cDvEt)!fB@h~le81f5D zBv^c|?(#k;&UUZFI`4d3-kY)_heXOFxdCiqK&3#N!p!(mM&A9IIb!?9GZ&USGNMZ; zeZqZ;83Jz8J>lMt%=rmP80#!~r(ebIO_$(ryf+ce=E!^mY>j<@C@(xX>De7yO9|q6 z2h#rV#W1+vsyF_##WajK*@w( zU~lwmyi;dU)8=zvOV*mJVd}wA6gSnyR*+K9dAR2kOuEk3+k-32{4LF97qR`^vb5Ow z(xRRVnOuz4+y`O^;`rE7xol9rxmqQ}|B-Kw^l_sV4ulK9CIWJ)E{7k5h}t>{;G!ZG 
zP`X<>79lstBgo(dn15)&f4segSJQK^pJ1`9jLdC>U8zW$5nb@iRK08{>j#1 z%|*a@cMkBY?orZHMy(6xp-&iAd%G}Xck<+?&G{v+L5BWV_X)yY&NN zRxvLu%(fex(4+E*msh0VL_na7h9P!63acKU|BHUCi2g+DFH8HwvOseYip!Apl3`x@RPT`AK{kAN=;@e=0gr+w`TnOoWNUd+!ACtRk43B9_Z$b_WF&&?-(wCP;z zC{Ki-1Dmc*g3ru$`@5JQiw1bt-31Ky(GKGu>w+5S<=AS5I9l{LULH)TK?hD)wwrK# z21QseXTt zY`FYHY$kLdJyz6bTN#YRn<}zT=j8H~(AU`(JChQE!8E`YaT|+931dvS&n;`V-9^z$ zez&uep*oQt_QyN(EyY`gQ(lL%KOX2qX}`5yM!JE=94g}!%^=bIXFd4f+5N`V**#)0 zD@{5D4CRcXK2J9!P;^~7Wq;CJJzNSBg22g}O5ZVeoiS8zCTa(C-Y-gIJ#t!h1b4PQ zHpTWEOq0o>a&PZbdypnOss@F5Lf-@2pqs>XxkO>HLEw6|20;;~P+EuNYP?Q`xd%{@ z)!gEpq9s?IgAMNl`q`bedKBEJ^L8_@8JzDcg?v5ZpgQRe%9sgy_*#iLlRB?z>v@HK z4DrE;Ns4k@ZM<+vN0absmc>c~up5@WGU8jq}bkpj*CkGVqzQEZU zmm#E+Z}trv*$_4Zo&^xtB6$-(^~U6I_IsekW7QQmr1nv%=J2=$-XRx+E5Y3gnrI7X;m zY(aN2TI!0a4E<&oGhddv-PNRtEOkid;#hYUjLBpL0_L+VQzGvPT1-gXg1}2wRrzBz z?O(Tv@SAU^6pScTYxf_?pd}(w7r+FT02aVuV`q?60y$CZ!Wz}_4b@2Mlzwrl!h&a z0+Wozx+k>0=s+d{JV>Mn~>S(7dV^>;nMUeJ~-|2}NWX4II5C=31Wp$w{$U zz1ykWF9$^|iLI~X6v%zrXPq;W8h!4A3kV-FsO$;8{~C)ps)C1BQ+YxVSZ>9Btd?0D zZr^kY=e*`i3bRBX~E7qdEJLLnt7N#7pNSUll8_JA{7u53$&ZZIi8kk%w7#fg}qyQgo29k7l=}o9Vl#=W?#B1i;m8VeZ{xW=z@7Iu>T2`DdwgN_K8b)*uB{7+w~b zGX{x7o{UdnA!enQtTjSjp5to$=m}6G5x@ZA4R<<}jXIfd1@w-#!+jDPvDQ~UN$fW% z=Q*HjrGBpSvho+1YoG}7l9zO&T2I@(DZ3RQb|);GB43WxYxzk7qb6@6NHaDe!1b!q z_wJ_aP78z7g`8g+c19fJA*^8^E1K^~9X2>&f`^Nzj3rLWaeY>~Fz7PLZU=sUEff=Ll##uI-a&^1x)mUZg6c9$+#B531Uq;iWeW%l zMP|@)z`Yz=vcKIUe{b0NVI;^pzuD0J<2R0{o2^5@kd;4So|sy-s~vOr2Pks%0napU zNKR0g;M$i*XAqwr31TMqrc{BJ047USMt~8Yo$u(EKSO)|6W3m9otH_@qfE>AKG=ve z+X>P5Tcb`xT2Zd~8z*|U313<)BCAA6Aqa_D?`kPo-FBu_k<9zB&_=4))K}R=% z3iP~9s@}ZSOl5H9z+ckcWT&!3512mqm?(90JfF1s^==M%o?^tYfwq8)juVE}rB)I* zQI+e3qK1#CG*-^RiQP_TEQ{6(!#p{Z-FHv`Cu6{T#3j>xI@yAiGe?z$cQ)O{C0EnJ zU)}hC@$oG^d7W_c4VAV=oznuoiAY4K(f?|`Mp;{yA2xQ%gVqS?Wu$7XV`p+u1ex1s=eMPq34h{yS=XZhZ1aq0lCrDErxvP%IhwT7q!pH5`PlEJlo=3W1L&iqR-~teRX}7>aB+cHRB&=EHujw zc7D2(woy2&1q2mlgxLwKzNsH>d96&j@m)ZU%{el^a6v@U=;Ryb-GyeFa4u)bNwxLN 
zOGYPa3Z6!*-uFv4K27;to`()3->C{~_lXwxI;y+tM_dhmUP}nParzY*Y$9AqD!6L1 zMjKKHl~{>m>6cv19~(Ybr5Bn-gJ~3{6emkCKhYnE3vxL>a*46&8gI6AYpxnMVrqKP zC78C_RCh0{W}4o3Qn77>;n<}FmnbV`W&LIjH2j#JnzU^@w>Q*7y#E&&XdZ* zg~dMtkHRH-O3x3WBzrfGQrm1gbEKbK)rh!4K;9LfBn&=GeGJowq&_CZTrDymQ8Yf= z;GSM@UnO-Y<8Zj#ENM51s^MSSVR&KF`@6b>C@`GILPq8CX8Ln)q1;tk}h$Kg@%7!@zqw0^6!oL)Mhl3LN_z6F(3Wx=P>2 zh}1~1IB0J8wa@f-aggLHaEKiO^N&(1t^n3kefECBKC0tS^uwoq{ z3tl8gG|RDm*ZA<>tAJGZ1;@67RWBT{i0X)Jp!tmjvflAZdsvF*z>$tLV7|_o6tDm) zBWL)3(}hSC2WQfKr@aT(_3*$08s;=L$6p9J2${E~B?cjUAS9TR2K5g_w-sc5#+!(n zAht5M|5HkhYWFkzEdo$*1Ym?yh$aCmm%c?;MOY%4oxfF90}z@*^>k%z$@422f$hxzzfXr)0SI*c+wpMGGA3(928T=YXDl$t6~GKhR5Z*L`v!-2%WrO$>jGzy`W4sjGp4 zC!tvL;_>Q#wg>qKKK7A;iBL&N$u|N2dsZQuuBBz}WZp-o|48zWZ+Hqoki?L`8v1*C zZ0hdH&DztM(0VnL?o{o}>}wN3>F}x`&sWt^x`$nmfqxC`3JI=Ppe3L=@yNdXOV@Aq z{Hgr@M=k)2OdQSwd3!Eu0g&~ta)D;@2X9af3$Fq|)r2uN!axT?c8z)6v{yEJ3Y2K% z(U&Ha{jSdc-2${%5W{Bc%$|SN0id#U-5OQSRR>nd{eZ45a~$V74+5^cVlIfGZ9GHbGEP*>IqYULzmy z{RwuSm_AkQQ}`DF2s#OV732phW5ItZN4!`nw}1dy>6n#2L#nYOU*2C2cO__39^YZ? zEbutYbt;j%e+`!oIOsX@RNB7am6KA)-Mh{$Suj=@LK_8;#&zf=l>cCgNF{|0-$*4r ze|mj191p)93y<}`5F!&tdV!K2xh`n{wB>4)YY5<(;`zGX`XU6h;Da~G>wqj?Usr4- z3^;&fxk&X1T&MpPq8fjzxf%Tw*b@1z$Ny0Y2y6@5kCe-JeSINY(7eep!mhtE1t2_7 z3k1V;xkAwSpoMo7v`p`^{ZB-|{MY`5j>e6D1q=8RKwwA$9F}W309UYq7Ggvk0J>C2 zcNea_zwYlhf=IBb2I*YYK?Y3TxK0Td@=ES^?r`R*6}(AP_1ToAF;L(1M=k2de3-mloIA)gOhhaCGDX8)VSr5&LVp8mQNoNt>$=(izV7(1 zh5+gR54{nDgui+71sB*vxsEs)0$@d8FB!)F7c->*NPXn5>#uy#@Q!={0Jyv~NR-6| z0Dj7=hP?Hp8}A8z3_i`~p(d{5Yp!EW-@HFPKu|6(E0p*F)}@E#PCq=0{s{2MtQ z%Bx_A!$UFUo3Fn3_{`^JWPCNr@E93M+VJYyt#{1I-|((-$*BG8AV+v{w%Zl>Fin?? 
zjO->Dey^G4TmoXcpzfmR!6lPY#@wk~h2V84qMm>R^6SBW!ohDvAG?bLOa;5*1+Rw` zWFZS;=aK;p`Srg{^q4_c3RqcLxycS3`wWlYz0RmL^!mA$s89}w-gO6nMATy7p>(%* z5vzVSMtl5kR7`D!jcQ1_>h*q?$ggzEZ^i_;H7Brn z+l+0(RpMVD`2e`Lmq5xarOZ($E>&oSLHT#`tAtfV zBCEro!*Jc91TCyz#kSZppM0$Q4S$lFw0}2p2))KKLiDqPF`KdgAwgPThY|W7qx}05 zysu;X7xzrXMK0|xzg<)5B6);0BKU@_jFvqU;FlJLHbIgOv}mi<9qK@JGvbWDABvI+ zJpKnC-*^T%Zn%!m`2gl`Y;FO^MFuVCe2~ApYXzEy<>u3JvV^eoB?tAmRouTH=iF(1Qa&w zUp}hn%X^pU^V*vk%K4DF6OSrZOU)3^0f+)lU%yu^cv5x+QzIE0h8X`k4EQb#EKJ`>m}eh? zA{xkGPzDdxTR;n+BIZA13}X-p;29S}jRPOK7Ys=MeEsJNbaj$~=fLb)f%iAy6FIK| z>86n0e?$bgxPtg4U!u8RM%BGTN6zIk{_B2@Rebzvf z1D+=p;%?FXqYH?47<1ZyDZOrott2_3Xovt3@dp3XzhIRsa0Dvj|3wWRpzfjnyjB~A zh=C~m8v{*^pcTZxQ`ox=>_@=ISqZa$kd5HJg2dlI;Jze58Wzw);iL@Lv@G{_a}A&i z`#(AN|GO7RTDr>Rm#fQB45{M3QPeOFv4M)zM1r2@6-4^z9W`*kEWek-&G^Yl7w_Beq z6YF^PcplHkPOs-{@@F}=UA-OU(}6ao`2ter+Kv9MAd;W zFLW!p$e=0A80?tC8dCqup1CUy11yMT?I{gHz+I6+E|Fxi{eE$~zHzeKf^AK|ADMgI z`N50wDy2+0W6-l;4yhj{)YB(e>QjOpYvoWpkvhJfemwWk{k zfp@IYhhErE$Ea$vTjUl>y0-E!CiZ!Pg}&)ZB3d=;+GFG0K6%kNE$5LR(gc4!v#$^y zs6^f9_0dd9J6pLLbCl-Kf%`?x)W&;DJ$L;HRW*C6PgZg=w5pt^%gSGxHXP_&p2YI+ zwsXz~mn~Z<%2{avjAr#2)%Z0G19; z_m&O9_-0-e(SE>w0YeDoGiH{t03s(#G{wXEA%x#5!7d}&&B>aOvcf-I!FbKoS0elb zp+hy@QRRFBMY!yP)vvgGt*Up`-%J~bAb8=-t=V~4NZq;agM(8VAjRjRLS5Gvbp}%O z_evIV_sWiZr(7RtX(cIMSvA}OsvGj4j15<43(;~Xgvat0u$}cGakLV^`*b+vGnF-T zxtQWp#tvBIR~QQk{pX&>aizluGsON=JfLN=Jr@I*7y#?2R%Kv8WBAUSt7Qt`KaIwR z!jth|{Pjj|@NRk-B29tq$}?&eGXl}y=KXO+*or#^EYcIeIrlJtiV$J2!*v1dBIMNZ z+{<(hIA~nlNri*>{r*}o9L2t*qh6@cI zgrq$R8g*d(I&S~&<+CDhur_!g>_YtUun6q2wT_Yh?eMY?V3+)fHP{=wY1P8N@V+6p z=3vx#`v)_!Y&wZ0p_ER7AeNPARH@YtM#9TbL>uCt-G33Y6jXS)ekCB8MVGm!4iw6~iNe`|I$ z;JTC+bk%N67ED@)^WX`imgn{l<33s5&=$O)p4~?E zz(qS4CMZG2s-S|3+;|pRIiT_)Zz(Vxdkj30c&Icy!T$0%L))*3(xd&LJZJ6KH~)^4 z)dDKYjJ=O5>xb*vnaLb?tjp8tTFyY(7@sU>;(qno?cfcSB;nfhU9|%pVooSb`Zm~p zJZotnD5I`Q-^#=c^y7Pvnq|gev6RpDxxxZ zy!E9^PDx!N=o+y=oRZazlbPp=1-f5+deFgTO4_05CtY&zf z9!yh6-6^vJ|E;0e&gW>Q)X7><@8vhYDgT1NGqB-uX_IHRDV)__N?;``-LmyrT?GBa 
zsA+oWcUjOQ)Qz~W<~8i+m5sKJgFWHSS{ovE4N;Xkp)6T0)aLUu)E|sWDd!0Y0k8w4 z>{eA)cGD&37(%VfqDr!xG%NO!z@GjkHB`5I!((5~+>AeGl0ob1YM-^;4v`lVyw? zdM1$7rq|=JpKE5L`}B?wwvOeS_wfbor)Y#vS(*E zq#A04jx<;3uuW3G4cwse{KCJK>fc`PixTpxjraxj6ZI0kJA{Dwcn$Y-CfQ`O=CG{6Z*-*2$LEg!FB}Db@LvMtzvf^@KZl$2&#`<6%-!YGF$m?hEGW6R zc$(1GH5N|V?UmEBmLQ4c8GGjkN;Fx~UbsV>q8xZUZczHxUE5-0wo$Qoz#A8%qLIxg zG7Xl$O4#x96_kNjrN#~_5xF)!9z7ycH8~&yxM4`mkDUUm_b*j)(G`FK8%#DG&^xVz zezh`Nn$~QVvrEXpeSc-1N?$UR%t$N9Gc6ckeAtKyo^X7NU|jaOVR3v)Qc2!Jv$>zb zL#hJJE!%1U1u_Mr1zYSqb40sRk~%&hdU2Glh|*?5lD)VWDMeWrPB;1`6ccIM@WKwT zYe1T3F*k&AKFBdq$)G$kdZ+$#ontlcYjP-dcc5uH?|p*%)nGq?GXV{m0-05Hj^Z-x zWmv4M!XWunjbuLJHcMwTHX}=4#2D528~YtpM2Tg#Q=!tA&zw`CGGia^_bcTq`em6l ziVbZX2A`&5bZobOz6&_k*pa3h#~Ho*aM(w6H#qD}TuRWB$0D!vU5EB`*&y$9CEo^PcrYXaHLB!IqHcO_>{}f`=Pl%icqT4n+@e<8Kp!=Zk6lqrJAPb_`Ee@wX($dqa_y*`;JX?%t?`AA$ z$@iu1xS0*2h#qZ)KMnP`9M)w)2G(1BGEE>W5W|`IVD)WI4%?jZvv7Xjn;1OLNA1J>V<=qOD1K@D!s%%UfWGj8Q6MEC2qZ#14;={)dDi2FMjHNDgcXL6 z;adBBa`%mf?XUotw}EeUsak$?ck!x4oqJ1k`^D24UYi)mK07Rlel7R*6I>0o;UuF! 
zwbfUr(~~$A1D}4_F~-}?h$!A9+3if0F4hJ-+_O!Xa4D#4u(nQbjtlysl_iNHIBQNg zzqfWsd@EuJOiRQqXA$3qV%`RigC15B8|6z9lVA&kB+EmYyhJPoIG@$eyHh`8?9lC7 z)LkZ=@=l``xJbB6Bly7sO@mUf?-QfXp)v-5vP!<jjE=dgqleNmdJf#VV?*w#`~&fV)<6za_h0hy6x3c1Q(4H ztW=h_KAg`>0)FK$CHE~F>CElh>=%a?OWpu5Gs%z@%*IEqm5jxp?P!ofUR8#n+TES@ zk|ZX9yt=JN@%vI4{}{eW14xF_>I2ND7lOBu268k=2*A9<0q7dbT7)DLJ?sCr@cDirM zk1qcI`Z~|BCbqC^PY^Vy2#AV+2#WM7O~C^M=^(v#M0)Qi5E8&+p$JM71VNGBq!W;) zD7{HSZ_)%pReA~Ej>q%*U9X<^hrdR1&Fq=k^E~Tb>(Qs0J1~w0YG_v09F>ks8`ur@ z%95MmB8Q41ODuFgd~(~~n%b+r)+CgB7UtF|_VvtF?Yyws!bzUyzjSr_Y)|XIbpEIJ z4bmbi3)r{Tq^V1AsAldVqmO~6OpB``*q2fyk@T^#Qm7;{q4J^PLEA8k-f}=gn1#d(EqqeLTHX2I*-I2*%wQS zZslUT>@bP%Pn%)>u3Rh$ukdL4xPeETJH?jWo$grP zt$QHZ_mPkwH>$1E^!D}#`q+r7z^o)D_6ep`c^{19RTcd|c&3b9At*YP+s z8YZtjxm-6QN{Vj&r(^FvL+oW8_S>xM$uQpx%}!!-PH2=Coqv;*w_V$`)0J{>Hi0#A zZ$2}H{s|~#pWQ@ASAKX%$$#&i&pY78kfRX$$Pyzc^&%pUJ7Ja){`q!XyVskZV`S3T zKE{YNyE>VH_)|oV=3=na(**9q2X!OpX}_6rjJlLejuT5o<)k8Gda zUFpkzF%SAl&ylzS$Jy%N>AFONfSsFU4OC4_c z9Kmtc`3~7XnpV_(#MoCDNAqD_K|v=0%?A72*TxJD?K0FKR2(Jdom#>#N(?1=v~p(D z*4Fm0+JUs!idm`YO{WKRHXK8rh?N>q<|HSo81m4wcSp6szt1SPASdCKW`49HkO(t6 zS818R+>zbW-tRK4C8Tz2@ag-(*SdVoZ>_)!BgSj_bA!Q>4uK$Ynvap`8LjA!IT#!r zE_t2rMhPi)K!2sqRodk5`LUX6Ix!C`%@55Jkh3nnKVDRHMZK{8+or*QeABH}mvHmpPTmg#t`oRpY6M_y7 z3|Y@7zS+gQpifQ=JlIV*t7q`aiq7XJA9+TUlr*T`(w7+s0+o-b*8$7GbAj8zC$#Cm1m8Y5QI;jOXpP}C^yka>sez;QWV#;z`U_!zJD0*|7n4X~QOUL+WB-S| z2W9*?U3v{IG#g3;=sJd)Ap8GQud{jzR;|AAF%3hEm<)w+o*;}AVJ#o?^C-ApLpE?m zslus#Kt{JJWzLV zzMs)tN1^~wiUq1Um(Xj{Uq2~G{K=01Cg$_erdgW#PzI~wU{3hAVr9q z#r(377_tyo`X02daRwg#ALnu&rnu$z+p_%M7jJ^bX5pm%i!Nd9l>~hjnaHpccf_!lv#oe@I-4C zNyZyK;S%OvQ}@Yfc<#8r1Y~ZpggWF(7rPNH9taubaJ$k2qO6%FA0Y^SJDVtNwiEmP zI76ER+3`!eoqiyI+YVZG3!oq0sq)6#L<0G@x@YTR-#gOvq;-kHuhC?Co3?jcE{c8( zxLVr~ATsuxAGLeq!_Vc~s zPFIQV!=wt@cGN)d+VI8Upc5Ji74^fqZ6p%_un7TCGVDNejc<@IC#VKw+Mfe;j8|`Q zz7`076rP|F{v`Sq_ncUPcU%tRIh|jnga_Xm5q)!HnZzd1PYKpZRi_K|I<96QdK_x) zT^DcaW2h^C;4-@2TzO{1Cf^>;6 
z&~+3IRoZCl_LewjU)yFpH$UV-EkBVqpLooGeUTq$KRC?~6c-E!j_)`X^4;Bug-*UQ5duJb1j$6w}Gj14(e&1Xckf+q|)3;dhTr4m(N2JS?P0D;=T z%X48vg%8uXX`y70e0jjo9&v5Vz)KcD15W8{ z%TKp%iH1n+R#()`O}6Fb)fqIb50;OW-amOAWXkjEc|FB!HgNjaK-@4q;r0W%*0YX9 zNU$EqN%YdrsmQ0EBdhO{I&@;_ajLCR(Z$rK6C@#NiR`pBYU z$jDk1r#Uf7EP2&y0}^rm?yf1>tXJf#*pST+*@rp_Hc`X zmI)(w-G3oFvmMZ$&OL$zAHW4j|9TUvB$I^C+bc z2ai%xZ2mfaZR4+&1LaQ~X!QryaGl8d=Ay2IiT*C0-g~19$yc&=>JASIHJi{*VB0%& z*thiH@E}v|7*f5NID#duN6%JV&%6031-;egI0{w0Na*!#CyMbt*sR{B707*NJDfeW z^HQCRJ$Sv|2CLTj7aN^}Ql2;p44d8<+@ELlNV>MVHSW6+Y4@_8XXd*%GdimhhD zIkT1;s2B*GP15cvQddB380M8XqaZ8{mRiS(vx_0c61~g3p)D-cm)y_$YCf z%JpZ)Z6pZE!)qs#hxY0eSAfvj;jgH7D8^@K{g&A%*&ru-K-fzB@k?O z#4?_Qi`Hy@Qpi!zW`qfn?KVaD<@8L|qRwWfuh7^6K9kq86;(?!*%;EAZHx2Hl2xEG zkqBU8k2(u`d$<(c(Uzj-+)g=CG`4P%72rxZk;c$2?W@KpXyEuInC`6%(&IX(a*n6w zAgT54s88Bq4;NN?-XtV8Plm-PAISZEfg~b61fw&M5|#aZ1PhrU<}iyoRNqCG1F0 z#4dy_>lapcUecy|`5E65u=a+@HTg#8OB7+JJARy&s`l=;3I)6dQ)^=S@XBkdN2;q^MaT;;`lsCNMx|ZwV=_5QlFQa^tE+&_wLDGCc zIo0gmOl*-tw)2F^@EIEI1z*w42`*=V_JGf<}i4)Ea z#5q!Qa4>3jp8ZD4CqVWuW_0MJ8A_d=VH4Km61Gn2M&_K_cvMYZeP$%b!btUG-T=HBcc z0Y4PeG`>N|y+Bf*`B9M6D5d`G)yU_L!Jpo(d?KzwyV3XM3;ynfsI4f>=4e`2YwjAV zyEr6=exJerYQv|QIOBr3z1CQ5WeZyPV!50YGQ|Ov(ik#%(k9tk59OK5k1AH|9g$Np z-M-0m$m^rL6(4JIRy;pc!A|d5AyE~b*^hSWao$;i6?RL*!{R`6F zixHhz71611p(@;fI%o74y{G!6nGDr-+&)Uw$HgB}G8L*3(Wi#Eh1Podmlb zZ=g}7Zks>2-7k_Q{bm{pz3t&5HN>~Y*g;;rT!nWK>C-Xeq=~J(G}ObQf5IC2%wd;* zo+PzO{bn@U*?-CE`x>S8_)14LZTr0!OgV5o0nBuNeR-Yl$Y zGJ-MFGEYL!f;xkt)s<;bpOs1O2YxW1L$_jJm?`fTR54rvw|NYJdfImEhIQRGvf#+UT zdKOUtc8@qsz1mu_m53jgD9E5c?D;ARoQcDB@&2ho)==n+qR}Pal!Z#|@c7f`E4Nve zdKDeL3aEW=sh}=HIqLh_l=SVITWXAC8s>x}4IK5sCA3=$j-6=U!o)?T+jn-43c=dS+0B#Y7Yxb|DttnORBwp!39%ik*k=Nk`chLIf`{iq@qON2#NU#m%8KNl@=CK7nzp{Gn<|kkuaM4KJ7Ok`&0{VSeFD|cV)iutY238W=! 
z`B#Mp_w2?z`p}_%C6X+jn!~7owcisMN8vo9cpXrj4!Xw_DX`JTYH9k-(06dvGaz_JRrSH``oLHO=bJ%BM}IW)e4;pA%f?# z*`>=td-S)ESe4_!&exOOFD}-C`YxjDxG}GRjQBvTm z(OGl%_9vELM~=SPU5o7dC{8+bI_QhEd}V)*mvr`uAJlFhY##As(RbIo&61B65j-nee z9X=OGQU^6=(_VvGNvrcXN3{K1KNu|27(3RxF@#r;lW6DL-b)QRRV+7?GM=fPGVl~e zHtsOVHweZ9xG?nlq`yZU_wbZNfm14(2g4r!W|{QmHXS2pPqOgMu^naVSdG+xN&_~n zx$0KmpLkfkA@_#=e7bz>ypflx(k-#arW#%&S4_Ws#7*BW2nTK(!^C6@E#~szS7~Yd z^s#Ow%xYm8${zeNL5PdwvbI~^TL$KKwx>*&=}WMSju=&50%wI%$Fa-3zTPG$<4jmb zd6t#S^2p&E60cTV*w3$z=C8c3dz#6Nc!AUVN9bu=%R_sbz4K7f8n>ucf(hr-zNao`(h4bz%p9b(S@q!RT|+))#< zb8t>Cj^X?}$)-0<5&#f2Y=j(z|AA)f84#IiWh6>aok`o-Ae*KYv5kGm>LW(Fc6MULZFIoA|z)6BD?J zpLmx!yx(Dz5F2I=H`Nah`d3n3?C7JImMcUH3gSkHt@U|`96|3zF6O>p^2XF;z|hRaG;$Gf?**%ekuxEMgWE~jnN`3L>w;55;|k5|+>puMeODcr z9S78ER9kCS9;D+!@*mtyUr)@rwGtcH9aHX-9;5oL=gG!1O1dr0DN`8BqLvtKU$n|49mt_#*Ke}p?0 zLb#wh*OQ5wSheo(+pUsFdr}=nveq5`{>$EzC+P$m zQudJ{s?~>jC-8P%=Ggc$f3Ci1gfRt<^{N!F&1Y;{>_QEwbwV3&ne1GY@EbTuQs8Ue zl^4^dF|y|J!f)+B=^jyu6te4PuDG$OElCLdu;+e*GAbkR_`ycBaYEp-&$Z0?(awXov-GL<9s_CumTCZ?M3uc6Q;hUZe+h+B4@> zS>-mQuh+9n@8`LUaH*EW-AP9eK4HAE`=X7{@%ai6Ln~aQ7G8N!d@DZ2I*@I@;+@Qw zI>`8X%x@#Fu;>%LoH?`VS;#%Y-C9Pgs+NOgX9?eAx~w~MOTyoyKh&&Rt0YH3z4jTP zLQ(1@p;B6|wkB+6kf>&gECBjhUCqXqJkm9ytgtbnng#M@0$hwWrRVTeBX(7&7q&|aITU(2x|3;q<>|{NpnYJ zazhMEF@5uL;1&#ZJgU!=C~j*>q{LMzbRGe^#6z_BsIYN$@FDnLsWf#lG;AVBlpV=i zqAP^F4qFtxc|&|2!x1k*QpN>cP%67Rt(RAK#5)j|)8+`wjH_~=Q~6K*$Y~SjsnH-eX0?PV9oo1I8Bs{E{{mgSHz8Q}<>w`yr%4e$FVE`#ir*4_ zIz;=q5-f2ahk&lw$|?N|@|dIKh{TYFUCk*gH&@7E!16y}UOEhS5x!R0!V;rBlm*xgE`jWI`c=+H6b@c#(raDX9|i2vra31~bXpKy%%i!n$c!9y+o zC6x%~G~$mplf*1{{LWk9=sbxcE^~g8)K+x%ui$X$4>(HXtq(5!iNtmsNqzVlVeubi zk@x`j|4y<0&x@ZXc_W`s0GL*m3?UG#wZr_27=mZ Date: Thu, 4 Jan 2024 23:27:14 +0100 Subject: [PATCH 03/10] Initialize DVC --- .dvc/.gitignore | 3 +++ .dvc/config | 0 .dvcignore | 3 +++ 3 files changed, 6 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 
index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore From 44c79e08c1c9205690c92da4db017801368ae38a Mon Sep 17 00:00:00 2001 From: Yurii Havrylko Date: Thu, 4 Jan 2024 23:30:10 +0100 Subject: [PATCH 04/10] Add raw data --- data/.gitignore | 1 + data/data.csv.dvc | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 data/.gitignore create mode 100644 data/data.csv.dvc diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..3e63a77 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/data.csv diff --git a/data/data.csv.dvc b/data/data.csv.dvc new file mode 100644 index 0000000..41d07a2 --- /dev/null +++ b/data/data.csv.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 7ec83b215d1790bedaf458a1690370e3 + size: 25144581 + hash: md5 + path: data.csv From 18fd053a8b2b8bbc7a3b25878ad23576e15cab5a Mon Sep 17 00:00:00 2001 From: Yurii Havrylko Date: Thu, 4 Jan 2024 23:32:55 +0100 Subject: [PATCH 05/10] configure remote --- .dvc/config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.dvc/config b/.dvc/config index e69de29..dd9a124 100644 --- a/.dvc/config +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + remote = minio +['remote "minio"'] + url = s3://ml-data + endpointurl = http://10.0.0.6:9000 From 6ab464b2b3dee0057f3f75a0884e75a0b51128fc Mon Sep 17 00:00:00 2001 From: Yurii Havrylko Date: Thu, 4 Jan 2024 23:40:55 +0100 Subject: [PATCH 06/10] added readme instructions --- README.md | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/README.md b/README.md index a67ba1f..0d05117 100644 --- a/README.md +++ b/README.md @@ 
-22,3 +22,45 @@ DH Images: Works on push to master/feature* ![Alt text](assets/actions.png) +### DVC + +Install DVC + +``` +brew install dvc +``` + +Init in repo + +``` +dvc init --subdir +git status +git commit -m "init DVC" +``` + +Move file with data and add to DVC, commit DVC data config + +``` +dvc add ./data/data.csv +git add data/.gitignore data/data.csv.dvc +git commit -m "create data" +``` + +Add remote data storage and push DVC remote config +(ensure that the bucket is already created) + +``` +dvc remote add -d minio s3://ml-data +dvc remote modify minio endpointurl $AWS_ENDPOINT # e.g. http://10.0.0.6:9000 + +git add .dvc/config +git commit -m "configure remote" +git push +``` + +Upload data +``` +export AWS_ACCESS_KEY_ID='...' +export AWS_SECRET_ACCESS_KEY='...' +dvc push +``` From f73fe8ca25bd63b4e7d323ac237eb4cc31bd47c3 Mon Sep 17 00:00:00 2001 From: Yurii Havrylko Date: Mon, 8 Jan 2024 01:46:00 +0100 Subject: [PATCH 07/10] training notebook --- app/requirements-dev.txt | 3 +++ app/requirements.txt | 1 + experiments/train.ipynb | 1 + 3 files changed, 5 insertions(+) create mode 100644 app/requirements-dev.txt create mode 100644 experiments/train.ipynb diff --git a/app/requirements-dev.txt b/app/requirements-dev.txt new file mode 100644 index 0000000..a2c36c9 --- /dev/null +++ b/app/requirements-dev.txt @@ -0,0 +1,3 @@ +ipykernel==6.28.0 +wandb==0.16.1 +evaluate==0.4.1 diff --git a/app/requirements.txt b/app/requirements.txt index 67d6f4c..3ad1e0c 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,2 +1,3 @@ flask==3.0.0 gunicorn==21.2.0 +transformers==4.36.2 diff --git a/experiments/train.ipynb b/experiments/train.ipynb new file mode 100644 index 0000000..45cb365 --- /dev/null +++ b/experiments/train.ipynb @@ -0,0 +1 @@ 
+{"cells":[{"cell_type":"code","execution_count":58,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:38:24.348418Z","iopub.status.busy":"2024-01-08T00:38:24.347727Z","iopub.status.idle":"2024-01-08T00:38:24.354720Z","shell.execute_reply":"2024-01-08T00:38:24.353625Z","shell.execute_reply.started":"2024-01-08T00:38:24.348385Z"},"trusted":true},"outputs":[],"source":["from datasets import load_dataset\n","from transformers import (\n"," BertForSequenceClassification,\n"," BertTokenizer,\n"," TrainingArguments,\n"," Trainer,\n"," pipeline\n",")\n","from functools import partial\n","import numpy as np\n","import evaluate\n","import wandb"]},{"cell_type":"markdown","metadata":{},"source":["## Load dataset"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-01-07T23:33:19.657359Z","iopub.status.busy":"2024-01-07T23:33:19.657062Z","iopub.status.idle":"2024-01-07T23:33:23.931368Z","shell.execute_reply":"2024-01-07T23:33:23.930520Z","shell.execute_reply.started":"2024-01-07T23:33:19.657337Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"335e4af20627429f823b7763046054fc","version_major":2,"version_minor":0},"text/plain":["Downloading: 0%| | 0.00/1.03k [00:00"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["Run data is saved locally in /kaggle/working/wandb/run-20240107_235614-88rzow23"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["Syncing run graceful-breeze-1 to Weights & Biases (docs)
"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[" View project at https://wandb.ai/yurii-havrylko/huggingface"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[" View run at https://wandb.ai/yurii-havrylko/huggingface/runs/88rzow23"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n"]},{"data":{"text/html":["\n","
\n"," \n"," \n"," [1500/1500 25:15, Epoch 3/3]\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
EpochTraining LossValidation LossAccuracy
10.0740000.0277870.986500
20.0326000.0109200.995000
30.0101000.0027390.999500

"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n"]},{"data":{"text/plain":["TrainOutput(global_step=1500, training_loss=0.038901503562927243, metrics={'train_runtime': 1572.7041, 'train_samples_per_second': 15.26, 'train_steps_per_second': 0.954, 'total_flos': 6314665328640000.0, 'train_loss': 0.038901503562927243, 'epoch': 
3.0})"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["trainer.train()"]},{"cell_type":"code","execution_count":42,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:27:04.256277Z","iopub.status.busy":"2024-01-08T00:27:04.255502Z","iopub.status.idle":"2024-01-08T00:27:04.261328Z","shell.execute_reply":"2024-01-08T00:27:04.260224Z","shell.execute_reply.started":"2024-01-08T00:27:04.256246Z"},"trusted":true},"outputs":[],"source":["PATH = \"working/checkpoints\""]},{"cell_type":"code","execution_count":49,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:34:03.559698Z","iopub.status.busy":"2024-01-08T00:34:03.559317Z","iopub.status.idle":"2024-01-08T00:34:04.650001Z","shell.execute_reply":"2024-01-08T00:34:04.648660Z","shell.execute_reply.started":"2024-01-08T00:34:03.559669Z"},"trusted":true},"outputs":[{"data":{"text/plain":["('/kaggle/working/checkpoints/tokenizer_config.json',\n"," '/kaggle/working/checkpoints/special_tokens_map.json',\n"," '/kaggle/working/checkpoints/vocab.txt',\n"," '/kaggle/working/checkpoints/added_tokens.json')"]},"execution_count":49,"metadata":{},"output_type":"execute_result"}],"source":["trainer.save_model(PATH)\n","bert_tokenizer.save_pretrained(PATH)"]},{"cell_type":"markdown","metadata":{},"source":["## Usage"]},{"cell_type":"code","execution_count":52,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:34:22.825288Z","iopub.status.busy":"2024-01-08T00:34:22.824467Z","iopub.status.idle":"2024-01-08T00:34:23.163749Z","shell.execute_reply":"2024-01-08T00:34:23.162693Z","shell.execute_reply.started":"2024-01-08T00:34:22.825254Z"},"trusted":true},"outputs":[],"source":["tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True)\n","model = BertForSequenceClassification.from_pretrained(PATH, 
local_files_only=True)\n"]},{"cell_type":"code","execution_count":53,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:36:16.296584Z","iopub.status.busy":"2024-01-08T00:36:16.295914Z","iopub.status.idle":"2024-01-08T00:36:16.303129Z","shell.execute_reply":"2024-01-08T00:36:16.301783Z","shell.execute_reply.started":"2024-01-08T00:36:16.296552Z"},"trusted":true},"outputs":[],"source":["text = \"\"\"\n","Liverpool struck twice late on to beat Arsenal at Emirates Stadium and reach the FA Cup fourth round.\n","\n","Arsenal paid the price for missing a host of opportunities and were punished as Liverpool grew increasingly dangerous, the deadlock broken when Trent Alexander-Arnold's free-kick glanced in off Jakub Kiwior's head with seven minutes left.\n","\n","Liverpool's triumph was completed in the closing seconds as a lethal break ended with Luis Diaz firing an emphatic finish high past Arsenal goalkeeper Aaron Ramsdale.\n","\n","Mikel Arteta's side dominated the first half, with Martin Odegaard hitting the bar while Reiss Nelson and Kai Havertz also had chances to give Arsenal reward for their pressure.\n","\n","Liverpool, despite missing captain Virgil van Dijk through illness and with Mohamed Salah at the Africa Cup of Nations, held firm and were always a threat. 
Alexander-Arnold hit the bar in the first half and as they grew into the game, Ramsdale saved well from Diaz and Diogo Jota headed against the woodwork.\n","\"\"\""]},{"cell_type":"code","execution_count":59,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:39:01.210310Z","iopub.status.busy":"2024-01-08T00:39:01.209908Z","iopub.status.idle":"2024-01-08T00:39:01.695811Z","shell.execute_reply":"2024-01-08T00:39:01.694120Z","shell.execute_reply.started":"2024-01-08T00:39:01.210282Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/transformers/pipelines/text_classification.py:105: UserWarning: `return_all_scores` is now deprecated, if want a similar functionality use `top_k=None` instead of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.\n"," warnings.warn(\n"]}],"source":["text_classification_pipeline = pipeline(\n"," \"text-classification\",\n"," model=PATH,\n"," tokenizer=PATH,\n"," return_all_scores=True,\n"," device=0,\n",")"]},{"cell_type":"code","execution_count":60,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:39:26.256297Z","iopub.status.busy":"2024-01-08T00:39:26.255624Z","iopub.status.idle":"2024-01-08T00:39:26.314708Z","shell.execute_reply":"2024-01-08T00:39:26.313773Z","shell.execute_reply.started":"2024-01-08T00:39:26.256264Z"},"trusted":true},"outputs":[{"data":{"text/plain":["[[{'label': 'LABEL_0', 'score': 0.9968350529670715},\n"," {'label': 'LABEL_1', 'score': 0.0031650131568312645}]]"]},"execution_count":60,"metadata":{},"output_type":"execute_result"}],"source":[" text_classification_pipeline(text)"]}],"metadata":{"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30627,"isGpuEnabled":true,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 
3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.11"}},"nbformat":4,"nbformat_minor":4} From b4331d80fd382bd5c2cc1dfa8b1d1ab537cc9f86 Mon Sep 17 00:00:00 2001 From: Yurii Havrylko Date: Mon, 8 Jan 2024 01:49:03 +0100 Subject: [PATCH 08/10] add model card --- modelcard.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 modelcard.md diff --git a/modelcard.md b/modelcard.md new file mode 100644 index 0000000..586a8d0 --- /dev/null +++ b/modelcard.md @@ -0,0 +1,63 @@ +--- +language: en +tags: +- bert +license: apache-2.0 +datasets: +- GonzaloA/fake_news +--- + +# BERT fake news classification model + +English-language model based on the uncased version of BERT, fine-tuned for binary classification of news as fake or real. + + +### How to use + +You can load the tokenizer and the model directly for sequence classification: + +```python + +tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True) +bert_model = BertForSequenceClassification.from_pretrained(PATH, local_files_only=True) + +# run inference + +``` +With the transformers pipeline: + +```python + +text_classification_pipeline = pipeline( + "text-classification", + model=PATH, + tokenizer=PATH, + return_all_scores=True +) +``` + + +## Training data + +The model was fine-tuned from [bert-base-uncased](https://huggingface.co/bert-base-uncased) on the GonzaloA/fake_news dataset, which consists of ~25,000 news articles labeled as fake or real. +For training, 10k samples were randomly selected and split in an 80:20 ratio. + +## Training procedure + +### Preprocessing + +The texts are tokenized using the BERT tokenizer. + +### Training + +The model was trained on GPU T4 x 2. 
+ +## Evaluation results + + +| Epoch | Training Loss | Validation Loss | Accuracy | +|-------|---------------|-----------------|----------| +| 1 | 0.074000 | 0.027787 | 0.986500 | +| 2 | 0.032600 | 0.010920 | 0.995000 | +| 3 | 0.010100 | 0.002739 | 0.999500 | + From 3f001a633ffe852a3741071bd8fe82603baac2c8 Mon Sep 17 00:00:00 2001 From: Yurii Havrylko Date: Sun, 11 Feb 2024 23:37:10 +0100 Subject: [PATCH 09/10] L6T1: Test code and set up CI tests (#10) * training code as script * test code * install dev reqi * add CI step * fix python version * fix path to * change app path * split tests and publish * add nessesary init files * fix path to tests * add missing deps * add wandb --- .github/workflows/ci.yml | 22 +++++++ app/requirements-dev.txt | 11 +++- app/src/model/training.py | 116 +++++++++++++++++++++++++++++++++++ app/tests/__init__.py | 0 app/tests/model/__init__.py | 0 app/tests/model/test_code.py | 11 ++++ 6 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 app/src/model/training.py create mode 100644 app/tests/__init__.py create mode 100644 app/tests/model/__init__.py create mode 100644 app/tests/model/test_code.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c4608a9..636e114 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,29 @@ on: workflow_dispatch: jobs: + tests: + runs-on: ubuntu-latest + steps: + - name: 'Checkout GitHub Action' + uses: actions/checkout@main + + - name: 'Set up Python' + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: 'Install dependencies' + run: | + python -m pip install --upgrade pip + pip install -r app/requirements-dev.txt + + - name: 'Run pytest' + run: | + cd app/ + pytest tests/ + push-image: + needs: tests runs-on: ubuntu-latest steps: - name: 'Checkout GitHub Action' diff --git a/app/requirements-dev.txt b/app/requirements-dev.txt index a2c36c9..6750fa7 100644 --- a/app/requirements-dev.txt +++ b/app/requirements-dev.txt 
@@ -1,3 +1,10 @@
-ipykernel==6.28.0
-wandb==0.16.1
+-r requirements.txt
+
 evaluate==0.4.1
+great-expectations==0.18.7
+pytest==7.4.4
+scikit-learn==1.3.2
+accelerate==0.25.0
+datasets==2.16.1
+wandb==0.16.1
+ipykernel==6.28.0
diff --git a/app/src/model/training.py b/app/src/model/training.py
new file mode 100644
index 0000000..41b40d4
--- /dev/null
+++ b/app/src/model/training.py
@@ -0,0 +1,123 @@
+import argparse
+import numpy as np
+from datasets import load_dataset
+from transformers import (BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer)
+from transformers import DataCollatorWithPadding
+import evaluate
+import wandb
+
+MODEL_NAME = "bert-base-uncased"
+SEED = 42
+TRAIN_SIZE = 8000
+EVAL_SIZE = 2000
+DATASET_NAME = "GonzaloA/fake_news"
+
+def parse_args(args=None):
+    """Parse command-line options for the training run.
+
+    Pass a list of strings as ``args`` to parse it instead of sys.argv[1:].
+    """
+    parser = argparse.ArgumentParser(description="Train BERT for sequence classification.")
+    parser.add_argument('--train_size', type=int, default=TRAIN_SIZE,
+                        help='Number of samples to use for training')
+    parser.add_argument('--eval_size', type=int, default=EVAL_SIZE,
+                        help='Number of samples to use for evaluation')
+    # Forward the caller's list; hard-coding None here ignored the parameter.
+    return parser.parse_args(args=args)
+
+def load_data(dataset_name=DATASET_NAME):
+    """Loads a dataset using Huggingface's datasets library."""
+    dataset = load_dataset(dataset_name)
+
+    wandb.log({"dataset": dataset_name})
+
+    return dataset
+
+def tokenize_data(tokenizer, dataset, padding=True, truncation=True, max_length=512):
+    """Tokenizes the "text" column of the dataset in batches."""
+    def tokenize_function(examples):
+        return tokenizer(examples["text"], padding=padding, truncation=truncation, max_length=max_length)
+
+    return dataset.map(tokenize_function, batched=True)
+
+def configure_training_args(output_dir="test_trainer"):
+    """Sets up the training arguments for the Trainer."""
+    return TrainingArguments(
+        output_dir=output_dir,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        logging_dir=f"{output_dir}/logs",  # directory for storing logs
+        logging_steps=10,
+        seed=SEED,
+    )
+
+def compute_metrics(eval_pred):
+    """Computes accuracy of the model predictions."""
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return evaluate.load("accuracy").compute(predictions=predictions, references=labels)
+
+def prepare_datasets(tokenized_datasets, args):
+    """Prepare the training and evaluation datasets from tokenized data."""
+    train_dataset = tokenized_datasets.select(range(args.train_size))
+    eval_dataset = tokenized_datasets.select(range(args.train_size, args.train_size + args.eval_size))
+    return train_dataset, eval_dataset
+
+def train_model(model, tokenizer, train_dataset, eval_dataset):
+    """Initialize the Trainer and train the model."""
+    training_args = configure_training_args()
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+        data_collator=data_collator,
+    )
+
+    trainer.train()
+    return trainer
+
+def initialize_wandb(args):
+    """Initialize Weights & Biases."""
+    wandb.init(project="bert_fake_news_classification", entity="your_wandb_username", config=args)
+
+def save_model_and_tokenizer(trainer, tokenizer, path="./model_checkpoint"):
+    """Save the trained model and tokenizer."""
+    trainer.save_model(path)
+    tokenizer.save_pretrained(path)
+    return path
+
+def log_to_wandb(dataset_name, artifact_path):
+    """Log dataset and model artifact to Weights & Biases."""
+    wandb.log({"dataset": dataset_name})
+    wandb.log_artifact(artifact_path, type="model", name="bert_fake_news_classifier")
+
+def finish_wandb():
+    """Finish the Weights & Biases run."""
+    wandb.finish()
+
+def main():
+    """Run the fine-tuning pipeline: load, tokenize, train, save, and log."""
+    args = parse_args()
+    initialize_wandb(args)
+
+    datasets = load_data()
+    small_train_dataset = datasets["train"].shuffle(seed=SEED).select(range(args.train_size + args.eval_size))
+    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
+    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
+
+    tokenized_datasets = tokenize_data(tokenizer, small_train_dataset)
+    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
+
+    train_dataset, eval_dataset = prepare_datasets(tokenized_datasets, args)
+    trainer = train_model(model, tokenizer, train_dataset, eval_dataset)
+
+    artifact_path = save_model_and_tokenizer(trainer, tokenizer)
+    log_to_wandb(DATASET_NAME, artifact_path)
+    finish_wandb()
+
+if __name__ == "__main__":
+    main()
diff --git a/app/tests/__init__.py b/app/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/tests/model/__init__.py b/app/tests/model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/tests/model/test_code.py b/app/tests/model/test_code.py
new file mode 100644
index 0000000..e5022c2
--- /dev/null
+++ b/app/tests/model/test_code.py
@@ -0,0 +1,11 @@
+import numpy as np
+from src.model.training import compute_metrics
+
+def test_compute_metrics():
+    mock_logits = np.array([[2, 0.1], [0.1, 2], [2, 0.1]])
+    mock_labels = np.array([0, 1, 0])
+
+    output = compute_metrics((mock_logits, mock_labels))
+
+    expected_accuracy = 1.0
+    assert output['accuracy'] == expected_accuracy, f"Expected accuracy: {expected_accuracy}, but got: {output['accuracy']}"

From ab633c0c3f53882411c4c896de6b0ecc86325316 Mon Sep 17 00:00:00 2001
From: Yurii Havrylko
Date: Sun, 11 Feb 2024 23:38:47 +0100
Subject: [PATCH 10/10] add publish function to wandb (#13)

---
 app/src/helpers/wandb_registry.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 app/src/helpers/wandb_registry.py

diff --git a/app/src/helpers/wandb_registry.py b/app/src/helpers/wandb_registry.py
new file mode 100644
index 0000000..0b65c6b
--- /dev/null
+++ b/app/src/helpers/wandb_registry.py
@@ -0,0 +1,11 @@
+import wandb
+
+def publish_model(model_path, project, name, model_type="model"):
+    """Upload the directory at model_path to W&B as an artifact called name."""
+    # The context manager finishes the run even if building or logging the
+    # artifact raises, so a failed publish does not leave the run open.
+    with wandb.init(project=project, job_type="model-publishing") as run:
+        artifact = wandb.Artifact(name, type=model_type)
+        artifact.add_dir(model_path)
+        run.log_artifact(artifact)
+    print(f"Published {name} to W&B")