From eb929c5c2c25f77811e65e1fc2dce0cb2cea27a3 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 18 Jan 2024 13:26:33 +0100 Subject: [PATCH 01/17] Optional meta field for UnstructuredFileConverter with proper tests --- .../converters/unstructured/converter.py | 25 ++++++--- .../unstructured/tests/test_converter.py | 52 +++++++++++++++++++ 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 92348e6cd..a4fab0de5 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -8,6 +8,7 @@ from typing import Any, Dict, List, Literal, Optional, Union from haystack import Document, component, default_to_dict +from haystack.components.converters.utils import normalize_metadata from tqdm import tqdm from unstructured.documents.elements import Element # type: ignore[import] @@ -89,12 +90,17 @@ def to_dict(self) -> Dict[str, Any]: ) @component.output_types(documents=List[Document]) - def run(self, paths: Union[List[str], List[os.PathLike]]): + def run(self, paths: Union[List[str], List[os.PathLike]], meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None): """ Convert files to Haystack Documents using the Unstructured API (hosted or running locally). :param paths: List of paths to convert. Paths can be files or directories. If a path is a directory, all files in the directory are converted. Subdirectories are ignored. + :param meta: Optional metadata to attach to the Documents. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced Documents. + If it's a list, the length of the list must match the number of sources, because the two lists will be zipped. + Defaults to `None`. """ unique_paths = {Path(path) for path in paths} @@ -107,9 +113,10 @@ def run(self, paths: Union[List[str], List[os.PathLike]]): # currently, the files are converted sequentially to gently handle API failures documents = [] + meta_list = normalize_metadata(meta, sources_count=len(all_filepaths)) - for filepath in tqdm( - all_filepaths, desc="Converting files to Haystack Documents", disable=not self.progress_bar + for filepath, metadata in tqdm( + zip(all_filepaths, meta_list), desc="Converting files to Haystack Documents", disable=not self.progress_bar ): elements = self._partition_file_into_elements(filepath=filepath) docs_for_file = self._create_documents( @@ -117,6 +124,7 @@ def run(self, paths: Union[List[str], List[os.PathLike]]): elements=elements, document_creation_mode=self.document_creation_mode, separator=self.separator, + meta=metadata, ) documents.extend(docs_for_file) @@ -128,6 +136,7 @@ def _create_documents( elements: List[Element], document_creation_mode: Literal["one-doc-per-file", "one-doc-per-page", "one-doc-per-element"], separator: str, + meta: Optional[Dict[str, Any]] = None, ) -> List[Document]: """ Create Haystack Documents from the elements returned by Unstructured. @@ -136,13 +145,16 @@ def _create_documents( if document_creation_mode == "one-doc-per-file": text = separator.join([str(el) for el in elements]) - docs = [Document(content=text, meta={"name": str(filepath)})] + metadata = meta + metadata["name"] = str(filepath) + docs = [Document(content=text, meta=metadata)] elif document_creation_mode == "one-doc-per-page": texts_per_page: defaultdict[int, str] = defaultdict(str) meta_per_page: defaultdict[int, dict] = defaultdict(dict) for el in elements: - metadata = {"name": str(filepath)} + metadata = meta + metadata["name"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) page_number = int(metadata.get("page_number", 1)) @@ -154,7 +166,8 @@ def _create_documents( elif document_creation_mode == "one-doc-per-element": for el in elements: - metadata = {"name": str(filepath)} + metadata = meta + metadata["name"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) if hasattr(el, "category"): diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index b0473df25..dc20b809c 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -97,3 +97,55 @@ def test_run_one_doc_per_element(self, samples_path): # elements have a category attribute that is saved in the document meta assert "category" in doc.meta + + @pytest.mark.integration + def test_run_one_doc_per_file_with_meta(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + meta = {"custom_meta": "foobar"} + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-file" + ) + + documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] + + assert len(documents) == 1 + assert documents[0].meta["name"] == str(pdf_path) + assert "custom_meta" in documents[0].meta + assert documents[0].meta["custom_meta"] == "foobar" + assert documents[0].meta == {"name": str(pdf_path), "custom_meta": "foobar"} + + @pytest.mark.integration + def test_run_one_doc_per_page_with_meta(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + meta = {"custom_meta": "foobar"} + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-page" + ) + + documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] + + assert len(documents) == 4 + for i, doc in enumerate(documents, start=1): + assert doc.meta["name"] == str(pdf_path) + assert doc.meta["page_number"] == i + assert "custom_meta" in doc.meta + assert doc.meta["custom_meta"] == "foobar" + @pytest.mark.integration + def test_run_one_doc_per_element_with_meta(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + meta = {"custom_meta": "foobar"} + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" + ) + + documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] + + assert len(documents) > 4 + for doc in documents: + assert doc.meta["name"] == str(pdf_path) + assert "page_number" in doc.meta + + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta + assert "custom_meta" in doc.meta + assert doc.meta["custom_meta"] == "foobar" \ No newline at end of file From a0da2e82349945c7cc42a8f301f68e7a052e38ad Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 18 Jan 2024 13:31:53 +0100 Subject: [PATCH 02/17] black lint --- .../components/converters/unstructured/converter.py | 6 +++++- integrations/unstructured/tests/test_converter.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index a4fab0de5..c9db4eea3 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -90,7 +90,11 @@ def to_dict(self) -> Dict[str, Any]: ) @component.output_types(documents=List[Document]) - def run(self, paths: Union[List[str], List[os.PathLike]], meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None): + def run( + self, + paths: Union[List[str], List[os.PathLike]], + meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + ): """ Convert files to Haystack Documents using the Unstructured API (hosted or running locally). diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index dc20b809c..7b34e5552 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -130,6 +130,7 @@ def test_run_one_doc_per_page_with_meta(self, samples_path): assert doc.meta["page_number"] == i assert "custom_meta" in doc.meta assert doc.meta["custom_meta"] == "foobar" + @pytest.mark.integration def test_run_one_doc_per_element_with_meta(self, samples_path): pdf_path = samples_path / "sample_pdf.pdf" @@ -148,4 +149,4 @@ def test_run_one_doc_per_element_with_meta(self, samples_path): # elements have a category attribute that is saved in the document meta assert "category" in doc.meta assert "custom_meta" in doc.meta - assert doc.meta["custom_meta"] == "foobar" \ No newline at end of file + assert doc.meta["custom_meta"] == "foobar" From 4a90be7f7d0c769e9b84ccfecfac327a51a3d861 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 18 Jan 2024 14:20:08 +0100 Subject: [PATCH 03/17] Adding multiple files and meta list test case --- .../tests/samples/sample_pdf2.pdf | Bin 0 -> 21457 bytes .../unstructured/tests/test_converter.py | 19 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 integrations/unstructured/tests/samples/sample_pdf2.pdf diff --git a/integrations/unstructured/tests/samples/sample_pdf2.pdf b/integrations/unstructured/tests/samples/sample_pdf2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c304dc004c4b69df370815ba1ab827f3a76ce2aa GIT binary patch literal 21457 zcmdSAV|ZrGwk{l}gN|*dW81cE+qP|Ytd7m@*tXTNZTlp>*1Pt*&sqDd@6Y$+Tr*X3 z)IF+3;i)<6p1H;#mK7AHqN8SnB;MMdn1Q6D#iPZu(Km;rXQrcvG_p4Ql-d4?adJY^ zNE%t2IGW=9QPW7{F+kGrSvnfoTkAO*;n4^hIhz?6$&2tq(g>Sl zBS>y;NC!uIBRwlfSHR5gE{Y0EC?OMVENznr1bD3lLXuvi)6hfr49$3ri;F zUJK0tvM>H1-Hxh&zKpr5NN2}gnh1y^z4t(9aDXtXfSsGh#vmdOfr;OG}#FU2@}&VgkMN8 z+dUXGe!8qvc+{t(NbHf?40V;9xT%MZrbYFwviU9qNUUa+Y8rQ1fpvPSHos^3*s-*Z zg&Zw1D?yIC%Cjt6iB3Q)h6 zag7sfsU6?D1x*SSadKkm0GMq-O>w~Tob7Kj+!4%npE`2(tjtk}%Hu~UhRk3jv|CtI zC?PmH3JsGyVfcCv_rBf9W;Qm|Lz$7?iFz4ypd@#Y2oV@&+7iU+ry5Ib2pQ-zx9Nk% zObU+fooRt&>ZAkU6~rdnC-kD3;UQuiyCFM}=c8XiEPuP7Yw zcVKE!ATwT&T0ZVY07yQ%ML_2Ob6bE}Kle!xX@6#0FijAZ&abvm(7qzPkilINw18`V zW?%d-;E;dtmBymv!>01j!(pm_aTDN^Wkmw+69|iii|2iuBq>HC4=BwQkmb4|bi`=> zq80Fz13rnz2=)r-6$qaKG^6)*5#X2aQZ)zG7m@Cf4MA7r3~o$iD$yn|nWr-W6%U|tIH$Ep$m!sA=S5#$4v2&KisZPEe z^rLTws6}q-E9=eci_^}f7^EmmqV6#p!!h_h`WNWW)fB3dmh(76FvF)uQS~b8#^`EP zC)WTk>YZ?4!3=bwZ(-SbH==3MTQgOHG{7!Iw)h_ReBC6ubZsSHLpt||?Q*!#_r!09 z^uhAMS&zZ(?;wqZPzSN_hsFPnM`TH)2#4`y|I2G2d^e+J(?P?woD~aHl7YKY&?%_;x`f` z8F4Hj{v~NP@g`n3UN?aP`LD&+v#O;j&6(;lZGv9nUNj;J49VBY+d~xzWpNfs$H^YV zGKmYxF3CF7*`zx7KH_hMFO5KQUA$cietFf(xx$er-=*jG5BW++@>4rrRv##8~vum7GcSMo=G_DyTLtZyMZtwHqzMr zwu`lkv0FDR57teq`@kTNUJ#iO*%BFZ&A8Vzu|2WOTxF*H-b$AANN!E;nM9CeoCKH@ zQ!1%sGlw*nS!z`(GfQhuVye;BGF>zf4TW&?= zX%W>Fcm8S_SA{`wovKNmblo_jRy?%^3txj~gMtgcj0BA;jZT%^c`mnHc13o6he(Hp z&v54?-f!IqzVxo*-0<8Ey)3=FO#{NxxUwnoJoP+|epZ8DcHx)Y+ZrU*VVPp_Vo^g< z3X8LAin`4c^%Kw*d!{6eWlSvtKdM*M)XIhDbJ%(~2R-xL(jO_HxFS>|GEr7hbSO_L z6{wCWan;DwKvk{kkE)+~e)pzXXc>471e&%5Jj}%RY|UkUCPw)i!rlX*p?I{rG8AY9VUhx!0<9b&+f7wegt(RsuFo zBw`@3C}AIC0!lkHxk@EYrK%zRUjN+uyoj5fJB&Mu8?oKFz08Z|LGKml)%j`ldf{UK zY2jH0gcYP5;s7!qL>NRFoEwTB0s+(sq^Aq8tHmFeW?F|xa~1`N|3$z#h!cVdS_}Ef z0galHv7F?@tx?I<+tmexN?%A&ptnUdW^j5?Bq9n$0>wpmNTg3_PDDz`Lnv8DB3YGE zt!_s+Qauu{kP8JD6WN2v*}mj{dQ~E1`(z7p`;3}F^LyRNZ~LFSB_k7+u{vF9cH}si zxY+A(?_r4DCA*T9H#$u&yC4e*Eg{eG`~4LAQI}c|MlW$7$$_ za&a0&ywfT;+Yuhd*_l6KMRYTmqUKaR1&vl3mp7!e+cGQwQ94quy<%aN;Y z*EOv>-|OdZ;+NZOqZ-$MvFx)%wme-muB$L#PA1Tvy44@q3g40;UWq?mSMySNTphg& zJZ&quD|mLYq+MB0!%i30EXFXVp1!!Y zO}xrnWUlBa|Huy`CT*|n$fMyj^Y_5=LdUw5&kSG`a344~4iZ+Y&)~hRMFpQ#pWe!5 zdC$c0kE7-e6CWKP!Bf%8xCxOUk%EEl7^gXo*#(wjmPF?6sQCVxZMy-R*Ns~N55%%V zS%$o(2M|_#w#%)72i)^4XwD_KaIexs(%0;}Uz6(L>bdhP^I^a8naw;TJ=PC>XEL$J zv8i~{-#u?4?^4t1hD{_U;>M9P(%G_Iys!BCgDxY&P(#>N9J!n8ccwEdt1H(={F1A* zFPrR3mfN0Rl#^g=5boGzU!QLn;rfb3u* z^-QyN*X*?${ho;l90B%pM&0i5uD!x_=(YAb1zrNH%`@uG`fB=WaI*b_6jFvftCXkW zb?#xoxTNjoBxgZ)qAt`VCb&_6<~`}LN~Q;NqGWIxVk9zo zW;>{Z?CsZm(o|Wtx65;ziT+UUg~ilyDv$VDaXZ`7{GHXl;la*~i`jW$QTSR}JI|fp zYwokb%|P7YiOhpcQg#%Nizmd(l`Dyr@!tA<;Pvw-8~rDJ{3kD^WBeyEWus&L7g?oa z_&?d|AN@b6D;*vkGaU=_e`ct=9uSThb4hz1KBr3dCE`D6N>8hVO3CMb$SakI%bV2@ zNqnc0_%-s~(9kWeM#^9mFa$ts2IdgJ*#w^r!t8rt>Nu~!v|)y0+8}`>D=i$JSd0uw z0(89$bF+v0G*-8yDmzHhe#^T~LEmIX2BX9HDEjE!`%%@q&w0DYdCPsveH**eYDjc7 ziE|p+V&^6zc$Lw#4+g2hOM4GX2IOG&V^1S{RqNeB*wa2Ky23B)wx>dh-HB$h{^m6{ z=beooFR5(-7Mp0+gV}DVm74~n(yQNmlwT2_CN#aSY+pT2S#0045W#KAVk( zd4FBcp68^xC2CW6ud@nML-a(q`7)ZdvEFZstZWFeXM)1HB~@*+onC=yLlq!Kd`OU|s{v)L9h--stgwZO42N$v(Q;m zMAP5zQA!@TfwYn?Iz2=m>M1v8&hhm|z#uf%b-I5kwRh!Xuly#0;NZrC;P63(Xz?b6 zXi;HX#nXdzbxsIuD?`A!$S^1qhJwg$A2nR+Fz@N88q`<_%jg)lUx8V;HF5@8*w*Zt z*0ejMs&~1Rsg^!$ex0rgB#H>Q^UugpjzuN%|B)jti}{U+E5;>=gSw51bC@Iuw>t2G zs4|2Pc~axy1(-R^&KVNJ81<~LjVN6a%SfTIL+b(c#s&OrDrv1FK##(rxq_2}+haT=W7!x)oUMU^f z_q5nvA;08Y6H-B$W^yZbc?_I2(Ocgu8W#eaq&g>tFa7Y@OCO6a**df0Us|z5Y=Nb< zh7@OY%?tI^A2hibHXfx$2h&o+Qq`bV+LD&?O?G&k_jINA^sV>wwC9ICzd!(@7~+%E z(neH&i>P6qAK!A#z~XsL25}Ol>X)Y+qo*8USBrM4;iSL|cC167sKc2bd4e;3!cXF3 z?q#mi8#GL`D(QH9?>35zdM~rIPHM*=qa8I*)EXJD#k~T^-T{9Ydy&pnVI_Kzx8pNY z@m6w*I>Nc_NfSupVB(pC{B7c-)%@(}Q|6)Jqx@d}W&&2ESGGys!vbB!;?;hbDYYL7 z+>eW~ZavPZGv}#ttx#w&YJMF>b@yFEUAx+I_Nw8#PYp`>KzYOO)?Y#`)85&6Gj^RN zj%RknqmuN3)#)>Cr{s7HrhcmASR*q*AcS;`bmacPBrnXz6z7bDzet0m(v4VPP{Uyar`pR& zUDtZ%p4c@P3Gz^G$6INWM`#ZqmA=?_#kB>xj92Z)jRf^(EHqm22h6LWom$kDRx*)K z9XF)f%TNP(HGb%>zTmIu2PJ6rL$=M}8Nf+;a%>f$DSx}WeDb*BIF#V>Ot^4SMZONS zhn^l3pUDBI5E(kj`Bn^o0eT9k1-vTK#HE5-rebLrM<<({okic| zcE%>i2XP3xDe(T;X27t5i%wIWa|RI~(CA8@C9l_!hLsnGL6?OBiw!gB?ifeP?U?cc z5>*&rGhME<8~Ti|SP4x9fpmu3!-f&RGFHEtS|YCvxZK(%P3cj)WO6R;+_KJQ?qwDb z9<(I14ZB=ICfewZ#4{s9m@cD?x?JH^NlHyUG|FsKq?iPFV@@CvPMbbzkzV57+8?SU zYi~ov65mp!vxi_GxJEm8oc7VJk-V&iOFfuyQdB<_uz>Knf_c*pw72v&sSeQX*iPC807XhbovMEGy)*wBKq zAEQoBiOgkkZ~~J*=RzisiEk>A1cY~S9WvP#(P3P`#@H*bZcrfGaON_mcaDF{on)O{ zr6$_9GzR?kTw7gVU*5cRzPY<^S%x`3Jze!+MOo(hS+H@B`b+Y(@J6=^fC-LeDeS6m=dEzFXv=am zBJjsJAFRQwI@B9+_?BmeW88#U*6h|A7qXk0LsWk;jiX3QiHOFT@80?T-?# zw1zb-(wYh1l4sCJNUsT}^un{iT@{TS;@>4+sC1Gk?KE-Y`UvIsCha5fZ588iJiqO3 zJa_3WOqUwH?zenFjpP2W4ASOD$at&c0v6Y=SH-r2-+?&j4EzCIj|gs4h@PkUQGN$g z*2e~+)K2oFs1Nc(shQ$KQ5oh2{dbVEr*U*NAgK*nG$%4CrB@%Bx~J2&p>YON-W%aT2hCD%^MQuyd!xN46if@V@0wPR5{x_Y3}neJcI(SZPX0^UNooK(Zj@UY~%0$Vn8eZSM}$$6Q3Q zc8a@BqUcsluW*oo9exYj-x9EkUGn0Xt5lrk-z2LyiZkS{u1HS&-ks7-x##bfVYPNif=>KAju`{h;)P_v>gpG?AjGL;6z-cZjG zEtNUukqY9YU3iqc8Tueas05(mgyM8-l4~Nr2jU(MJQZ7n+Rm>Ws^p2?$4V6)_`+Zd znTxGizxSd0Ih-Ay@?B@{o7hNNp~bmyTb$z)Ik!xBP9neMsZ3w%L-AD#v3*nU+BA$b!V&h1?6?`(DfYg?moJd8dNMibW&#TO1#w;~XoKjn0 zP8M6M4sMW(9Cp>RJEP)q`77-m8sV5Vy4~a`h0DL2^3jfs!fHAhw?6mx`9;UA)|{M$ zr;I|LJ{Mi@7OjS5<#Z%EHf` zVA0(S^czph2cW$rQrgsyvc_Khs1Ev>jOC>0Bkv&}R?Rb7ltI5!2!2h}iY-cRAF^a)thPzP4U$iY~caMH^COQ=H&o}Xlobwvmk|eLr;Fdr;?qoJ)?%svj zn@&z?-#lMg!Dvw8qLo6Xb6RN|!21pyNP*DLQ2ly*>vrrqrIL2~dRnLMV#puB*1_0L$KRN@~)FRHqV&t-n z6id*akFOYWmR#&EwAtu#jdP(L@TPHo;Gw~hcG*B@JP&Q~@Ab1{#0^S6Sw_821Zr|q z$5*n%i+|R)OfqdFB_<>^Rjimsq4>XP_ReFCiu449?oz0Xa@9HRY%jm$wwqU46cX%l z1oAYsrQ_ZtBdBf%&Fk9yK0f;<;jKIjC~HvIS|bVwD1ljyz>FBW3T3b11$do@@x@Dw zJ95VSJ5FHKu24EU-$h9R>Bn&|Gq0U!56U-ellA?HxhC%ZEWJPH!NOO8FVsL&U4ha4 z8uTHJgbbm>BH#TEu_yHQhIHbXuu8(;)H`{tKbxZfYaW2h*QF6K{BoyHm1~P`R3v8> zGr*)4C#&w8YbBcF&cihe1n!&oX)<_`tNE4dsm}%4y7sfhTR~EHj87!@P`1>M_ZidU z&GiH?QYIFv3-I}!EKJ$E=WxG5O4Dd1k=m2Y=Ou*Hz?jVjhH=Lnc zt{j!rV#24j>yH3i%1|%S*qROH!`vWI^}OI@wMTGB0*H`&;$V_7zjv(AG?9LaxjWaV z5^+gr-{G6b5rV6OgXYyF@++2mKOgwqpl**-&HPM$wDfL1z{Va%uKQ7vM{)jePxI7_ zct-X55p%foQ_sZg7$jGHMvnaJ_l-NUPSYt>04ebuOaS6ILb5hudt@tP zw=`wmEXi>v=>RNa<|9qxQ1ti?(Q(^(J3{aL7n@WLuf1SF8ZPVGruS~jOP%EQ9`P4 z<1lP>vcUb*PHc=Nnz!@{=(-?z6{ls*1vds`4aNn+KzYR+vJw@~8^7G9_z}bjRQkds z7Gs;$)w^5pqCi)}kvPBHNAblU2-ezUj?^zhD>__~*PGLxjZ*Z{VsEQNt zBZSvGs5bSs4418lQD^#_Qt!?iDe8l&RoFVO)-~1aPfL@1Jrl9Jb3-hdpY-(5SB8m7 zvU4#P9v}-}u%g5&ZQUcZfetqv>Wg_Ep-Y(v`%)qnwtAL|aZ4$L8 z=6jisqEx=Ib?VXk!3kxD9l6FSY>^ArDRpMCv)T`?)7Ls9nRlh^G^{qK8q($1XIzQq zRes6gT6~w>$}-s~H8kGD>tfrvcYGC&2sZ786}zkXB-z}Lon zIVw z_Icj0&OE^)Nsa160halQdoE)H)lGU}Q-Ktfdo&2sVto@Ew`5f)!R^`bG~#81Q?@0B zX6x$}GF@aY!HTp73L`1HLH$lIcLrWw z1J<|$Kjqn+TZnjShXf|GQ4lx-sI*fC)#nhh=?oB!%i~STyCapN(91fTAsLkQ`pHLC zgV4)p4UmmL@syhX0!KpUJ{}%&Hg;&?x&^(v`T-HEQj4Am1u=2#ZQ9-pY*|in?q*Id zIy$ZAb0``uudAK87bF!M95l2&Pgo${6rGmFPy*Jao|$;AMLFa*Hd%z5Y_I~_m~SWe z;@MMoKq{Mhdr7uRNG^`{l#}=<2NyvwHM}9JIRbx5a8Qn+pzOUs*!q6|fGs{{K$s%k zVC@^@xLqT^R?4q#YIroIF{*epP&=RH*ksTv{|i2skEPB8=YsVQqNEF{H%D|@uZE_i zP@(qUIcJ0!T4sue50%{h)<2IR!sbGDRmARMXn#CbK;`Xa?2|i5i_(mP^Q_&p-n^z8 zY#q)s)$;S=G1Dmmf8;Qt>i1W@djR@1o`gIq-NN4d&_@N3T+wAnwide%rFaL)@8Zz% zVNCf50gC#upCTVQohUI_NiSX!+J6Jp$dRHVV@KWdF-t{A_lxW@3xkJRu>g ze4FiG8OX^6d9_Qf*qSb6^jB8%6(RhEeUu#Pl;M!TL`HcQ9bS{dOmvo3jt>urVF&A& z=}w(dr{x4Bn#(qvP4u;YPTHH=p6j2ccc4@E)>gPzgOo*g(qISckk`n-?0M#G@`iYM3u2p1fr52i{RVM=U*F8d}<#IPL3FPBNjHE-NGK zPtW*FrgL#7m95`9CVFY0KG?XIu3D>Di{NuCp4HaZ7ZTKr&y!eqRvXynPV5g#C12%{ z#mt>p!-kvL%!E>s@-q6}2{Tdt;`SD`^Y;zQ$cLxJ_bqOeT_8PDucB&gzJlI3w(1^R zq$5nV2^t0D@dejKbNu4FQywx((*s?5F4uEE4IHm{)QF!}1g0y%*Jz-px9&%QddB7P zyx-WFUFL+#>iGbm#dCehZ~&*_A3mX1ZanLRu^39fa2tohIY(%fXGFwJTn4SK>wU!& z-hi^%;pwqi*ZnGj3&-ttksgAJ)DGg15O6B(`y(|egl-S*&IY-f?5mywQ5nKBf_1Pd zYdwwL4O6QmVsbK3NY#eh32{K<*5$xE@59yw2Y4{Dw3s!tM|F(E>yKeyPMxD*W)Qmiav8ogWUa?uM{|F-ryn+ASl9Iax^J&XPEl1lQ=KitVFp0 z{B}P;LnT9Ka{$1F<@AumUt4Mla5|-5+|=n7^<6d`3fI*2)i{%qXhBY=MNAdAyX(tZ zZh%%ho4V&P)thsZYy>9}$%hXT#JRjhB|0+`9YcOe5ZLBVaVEL$K-)P{NL0X=j7Myx z8>086$o*9Rf^u`GN=q`1+WVF`kbDXo{)%2jwP;NhNmk0==;9%dbqHft9=umqSSimo zXMPu0ak*w1QB8&rBO;lrPq{?(tC%TGin@qp_J=i%QSfh#o42#*-|5^{M;uaMQR5g8 zi|QKnFg0zUkewhb0+ra+T%`2aC#w0nr}?7v#^f3a=Sjm0jq+ka`q}W;1s+kaT!g== zmoQs4+%&&V1k@oth%ntDm|`(PvBNY&ID$E{9KcVyOe&<5v?o+OmR~#Dt|xS(VAJ23iwD*BG*u1W9Fmsy$Pi zA8jNW76 zLC?YK1^kpYc@abz9CbgqnKqCcC0K5{?ayJ4FUJfV?3h#4jJ+M-R3z+xj4L1!*J;n<8@#N4KL9=`LRCIRvn%t}yT+TTz-(qoL2KrkBK$l2NE70{2 zQ5Ji*?Kyt|4#Q~#Zhk{i*!?C5a|FD+P~=62zA)lp7?l9*$P+!aMH>VXuEB=%|UH4*487yN9wA<~pw#DYRKPr%?Sx_TGSKzVWeWS|A&?$g*U ztHiaf!1GNtrd70cu@p6cPLD}xYsca(wO#o$Tm#SQ@`dy%kPfws|%&4=0oM^GgTWm&vm&c3@*xAi+*AcChr|-qezs zLP>+Y)PpvlCe#8=r4DZV+Du*0yhjSxYd*MqZs9qu^^7Hqx zQ?Rot*%35zobmc15@jd(u{Cs2glKxroOoWVRgZ zN(x%*@-*Kj4>3^PwJMxa3UIT?Sr<(P+k*Izm-W#i6pFy!cV=K~tit06r+ z8J~UgKaY-abie(TlaQV(do`@cP~i0y+PUcr!Y-D1)-aR&5YP=HP`^5RN?dG{Yp&Gx zV6o;gXy+35KNP3BqlD(6tO!lgF^s&Nk(owJszHgUeSDxA0C3yark;gUo&D*a`x&#b3_he>&*%Wqhf6`igp;Nj2^gx>gExY$q}kEA zd1O|x9+Urj_@zVB#OPF|H+O?Neze5S3vz0$S(IyyRt*)6r6G29bWgHxpF*cBMVw##0^b(zE26=*Y{ zdTT_ih^bHN-c#h9VJ4ir#ylQ?UT3S-HXf-aMQ53O%3WIOBmt%O@N&LYv(n_M9cZ;2 z6NRqQpj+5Cf}zo%ewJfSqN~Sbe|fKHcZ#4`3IK*>eUR-K2L-#kSK0#y^7DHXUw`uh zK%L&sw07BdW#`IrYN^{koX)+xySo%9nBKf}^?ad45>RadS6zZ8eC>dzP&m=*8926c z;F{MCPP#5#hovZ!ZJx9N8OclTAqK!cvui3B)xY zS(LLTwc`&#?$_b#q+mURVm>vx}3^$0NRvm4L8JoN{y zuD38QJYD~hO8J9no8O7F-xU{l1 zja89Pn8`;WZ!A%^dStGAT+XM`%rk9?(qfRg>Rm^L3L2AAZO`6#Z5!7Cg3JZjsPzj!MtO8(=5xBJGQ6;e_GrmT) zX_?WFSE&N83EU<}mCY!~QOl}l`Q0;E^Y@h=qF*(+j}ezK#pzuqhy|)qYybVi8>H*l zOk8dh$3;jf^1H}|$N<64qErEf+XSRe*B82I&57EJJ5pmFVo&Eyu^X1L_nu*!%XakW zUEGT|aAgWiSKGZr+uIkqO@U|J=B*T4EQ2uu4b&WiRyZn3PNHg9nvU8oI;|FC3 z9{vuFIbRz$aDVm5TmltGQCdn8@B5NGmo>MorYfDsnwYGO!rvA=Pw+0SPb@wG4fV;j z@s%f*0L<}tZn9Lvl1U`j&FYC!){FD2#qWWFTB!wf**AIc$k-yCdYXsTay%-cm^Leu zE94}tW88VUO6Mp}TlcV@at|jfqV*Ju<%&|JjMM|#d^sh<&44c(G`d;rJ5+|*wTM3y zcr7q(Rg)jA_+n@DcsbsMPwCb)T7pBs!ATzu?N`Cjy%R3jJ6>2Ij9Wz-N;-#^Do&L8 zlFr?Oy&G)HF7BrVASI(Ni()+xe%;s6SgxdNQiybW>Nb7NuWH77#vrV83{3Wf)oOc? zw4f7BiMv8%N8jl_ZyIbascm~JVytlDSq?}2)|3l#@POd8=ID2&UXbswNFJWViX4f( zd_Qjd2;uu?emWKg)cwm1+JTxZmx>ahic(LeKv{8tn3g!aV5#(sx2h-?yhFV;}tcv-Li!7bZ=h;v8VI8;{qg zib$QdujOyQqr3%vA;&KT>jcWwVlTF2^|SeES+bPz19>b#15INs4^--2pB$f5EhU_o zg{i=t#-~nJJ(fo*xyVS5kCF68YkKv>DGb2kW^nrJ4A;GX)fwi%|2Muw^nY_3QgpNZ z@Magbv33+Raxkzrvvstw|8N-kC`;>EeQ1;vrKQ!?C`64cosAsL4D_UpoQyv7mU<=* zcntLasL9W7gh%_~LGNH- zWbKH@#`Mwjv%}8<6+PXDZ@qw?t*DWiiK!zV3mqdQje?_*mGVcuPd9u$Lo;g=JbG40 z8r6SRVWMOCM-@>#)=#U>CMsrzAHMpGj32m^p6fqK^z`%}?LX)8;i7LukN3x4O8(O~ z|1Ym9x_`S8AEur39sewhqP>&R=Mq8E@as7keVY0om*l@KAs*f5BFjV42w8u`1AGqu zV^{|AHdcDpe<2?uGqZPa6fo7Z|Fe>kdjBl^8TFqTIN)i}v#@>`Vx(vO$p3MDBJ@mb zf6^!NS!Q7UD1Y7=81R^wnEyYeKX(3D|I0EnBP$*=D-#|IJv|-^Eh8Q?9UUGM3nL!O zM{l3lXRl0jY{x$Z`_8;T@lYjQf z#PDHmKKkJj_Dt>{*YOi zKKiHoNFOpI>tD0`f8qW)VmjK7?0>pGr}e4(`0=#&K(^pW;_4l%*swMUcGfTC|rJUs!dw^}n^Ss`{?#+xn6ymWWwW=*pRz zC2A2c)pP=qi^_`sP&6mjajz*|lV?&Y$RfW*Fmot@=JHJxjryU?&vgr@#C4|`hnezW zZz+YJP_gfLiRC;!m1W(N;n{h?dW}3jTbxMV-MXM!d{1WYzOrfABK#2H8Ial2%1N*9 zt|2r(D`;1yQmx%H98R}JHKoDxcM!HVi0&iwsiVD}(P{K<%bWjKn)Q^r<=un}Ol3%y zlgC3c&s}%D0@K@jZ@}d`_w8+_<^9Y0xpbixo|z39bt@*9;ATmkxMUPDf-Xz9OxZYh z-sDXJZQ(c9tJS3bhU^;OLAR-x{VIEbjw+w`{WI5}Gq183{>MKFQw(~Cy)%Au!A@%1 z^Dq*o8l0qS=6+{g?)>7%Z}%(gaD4hpUr9pRQ~{TCx@juw7c1X z7`{k-_Ld=(@RAkH|I7#*p1YQ?Gfnm#hrJ2Gtq;y%=C?>c}48z{aM-TFgPhoRj zCExae6|%#bGHCguAyTxj1ExyjsvIje-5^`NY~xK8p~Sme@O#w`HdA~Zws;Mk{W@AL z*Spp9H;X&~b7dT&4SE@d{CKEC@op{KEA?pm!NZpF%%l(RC?vYOhzqpW6~&Dvm>3ft zd-(7oh>2rn@ppFia1P$|d3fm+e4^~~;EXBUP2N?tRWMB~+qhmph)g=*;3iptw-qrM zTBaOk@L7NsmDi*K`8m+d^tP7nwziq>3Ej}!FZ+S(u_W(%lHOZ&TabD4P!XtK7sGUS z!6>XTvk4Hp!8Q>s%d9y3?z_*|yvJt;!XC|2gjwNA01aUVckJd7ws?!dLhbOBfxlpN zatxwR5BPCIg|dy9=o05P%6jt3r2KHS%P-;Qs%~43CBf)qO2lAi%D{mB3AS!HMcovD zeHC!f$L-~KIbCp>(J^ywaqZ;HCmYUw)R2EN#vwIs2j8u)Mva1Z zuil>YB{qCbv%6|l9pYW8TkbXfCYPtOOw{;SEYpozZXUT>uOHzEK{5v+6(Jdf zWD28-aP`2glG4&g3JMAn#!I6{{nRSw*DTey#C*@|(|D~tJf7TieqoaO)Fdr7S~}t+ z2?i^BSRmIi+JuCWrqkeFK|7S?i=PB`bQbhlGzWj-c zWcsh8?^k|7CEPAJ$v<7Ce8b9>fx1`exGS zoT#O8W0Q-M;d)%f;UtUWoSyNVECq)la(p>ZLN$R8&DK8aCr~-VBR`X|F}9k(p0@`5 zcqT5e=ny~}Y%L|5RkcU0US~RjhXhK(HR4Hov{%%PMdUhb-b81ryslloGYxzD5qfUP zyq5{t`PDDd1)aWq6nIlY`}+MP$?8TyD_S!i{Dce&H+Hh%wKoau@nF4H#c<^za%vpp z1b(>qMp{@<8wfGYm1BRGzYL1CoBJ) zhT;ch|6HvLjxk-MMz|F}hyRF|!{g%P|KRA${-d_flL zT}fR601!=3I50uPs=@ZUwnIpkezETf?e&FU)m&3~l&0r@(_rShfVNzdQOa~EZN@!x zanEP+epzhD2ql>K^&`bxEW0Fsq^!TN9$({~k?bsja{!W%{DPMjYsa&?QB@-vdtj42 zABzPC+xj4y+9&V4qi95@lKWK<=Jf*j^TXnYx|yU<@rJI_XW&3 zxHpL3qH#m+Lasg2>Qhu&@5$WS!0Q7#mC-pZgg1SJY<>oPxf}$r`Xf%O5Lf%DoPH}1 zK!Vs&ov1~AGk_0QEOsk*aS?dW_cy?~ECo>eP zH3xnH&ECtV^@UAK!n=hmaMJj)paAURHU&Hd2r|2E$b!VnAFzK%eO?#Drm3EngC3rc zGkNrLKH{57-YozN|IHe#I8wJMI8nQ6%}Td&c|Qk~l%9Car?n-88((TsVkFG0q+;A; zuCC^-HnlIqqazx|11)#2q1JFU@%@)pe3@c>DLGlil_rr=R32&SxtI_C^*wKNfM9GI z%T@+&fS_TadTQJ^2691&Eu+K}v7cX+TH*zh$6NNboCBX{P}a{el-2~4IZ|bT z)Gm|E7a}3^U}w)7c(c<5rIB4AYp~6FuR%bz>d7^7hl9hZZRVJh>p@R~&q^^4lV%-< zaCyT3%w0u)CQ16VSDy+LKiK15W+8~lZ`w;W{tT3cS+j*a{EcbP(PM{aMRBdAB^sPi z#&3d&+AK`Klmi^Ex%Jx|&_jDILjKiAe0&orG{Iok&_T?$m#lrmF?sM*K<(5%)vO_< zQ@ho63{M-jTBd1_ju>G|G=f_Iz;- zl8WLLxqtw_+Jzyr_by zfCJ_BvxnH2&;~}*`14S0B7J}oT8QtryCz#Wxr50k5^{mt4KfYA7C0uB3=< zk}>yy!FtsV7;O1^7B>uu=#5R~77Vsvg|9d(#ro-GRo`6jk3y^^n!cTL;NUkhgRc~A zfaax-E%PfN*D8T#{5nUMWR)6<8f_4E{VC3Z+U#-3bz<(m(QKD<3rx7E6lo=^{7S61 zJIE7gX3947q?iWP(?C{HpFJwub+6;nz71J@_Ix8?lhe&cEX(~ZoHM+YT#CQphc$W? zUC_Qbw$X}T*-ug2lzIoa)si`?teASlGGM`NgX)?0Tr35VIqg*wB^P>0^ftGmFGen+ zJetA*+1!3>#n~0Ve#Tc6-%~(xMGo&Vkofjl0Ee7=22AO-@*&PCKz;!s(-pU8Gk5CXW1s?=kh5XN_jzsh zj4`oPH2z`<{J6$+sb#= zE=-Qj^-k~FVcKVL*GnRcu1Seq22p$xO_;x0tIWAMk4wcfWQ(G?QVoNz%&$)>Yv^kX z7)H4*t5pdc_X`nj_4QhGU%%MEt;w|yLk0l(9OJC2Kb!hb0Ukq5%E8LIMdPO?;|$J% zjU@SX6{=k!ew7U3@I^||WG!+~bP}N(TgXJ^d)kaH=6kehK z2)a7NnUrRag6KxehtP+qy4E(Ma=S3^q(0>?q&tANvx1H4r95DS+N_ni0LVI-#>DFNxFUwsO2b&?3 ztV~9ElFg|Y(DVLXY&ExvPi@@Y8N`{8u-r{U>qjyj&T1DBxnfqGdGG`wO>0aKLlk;y z^73xYk0U=tC<3Oa3mcv616kQfuhSmvY}la)z@HHyKKp(Vgd?!eFWX zH1jwa&aXZ~+-qcr)~1ly(y+x081|SZ9o#d>%JCXtFV4Vr81O@=>^TOkF| zRN)Ggl1(mN3gsXoQAwV4Ggn<1ejWDaA&x=qM+Wfpc+y$pCN8#%`CS*ndec{PuVVux zv!yL#en(nr=%JqQ>6QK#6vP9U?u9IQju7F4*0p3%BoEWm(FS2{)4iOW{kU9)r5*?z z;=1p$!X0o6CKve1fQi{zi5sO+Rb`{wz9M+xX@ULZ_defTy-v(w*}A3_+FDpkf$&lb z8+_-9jx6QpQO)dgy5tCO_8Uc6L181}5a%u|Q)S|=oVDM3-NX0(2+J3@$2t1?_4ysp z*?>9V|0IF*e~;Y!|D7-W#W?AiSn2*EZ1jHxl>Y6XLx1%DkuPPcEn6>(e)1)xNG(Ai zL-j2of`Bf7Fou?trAzDDDLhy)9CgJA!_kOQlG6to7jL}Ix1p>f60)IMLAN-7`5^Nw zZl@zfrR!+!No=W$2`J8yyCIjN@3^vLhlsn5P8Een}A9 z)^r0;cfF7=02Vx5J-(zMoGt+UG(|gOXfb= z5lmi%n*`EGn;ki-;YpMtfylx0q3zRKEKf#&o*gTi=X}}H7$?Am<5^3}v@Yxib)EKD zCg4vBAg7)-C=$tQ27<^m}}~UJ?@9u6$)q=^XiH13>|82}p-xJ+*lAK$Umk z6L7>@k^DCZz(P!nuP_3?$vl{|Gl3d-?J|E7`%vBvKFJxoNs7OjD-E@>Nu<7x-3CV-0y z$OA2cG081(In;9waHKlnqlv`D5>E4i<0V;>6!ifQ-W%VP&uw~C7&Fd7LW5Bv zBQv8S%hD!?av0}vNY06nLqmj|iuGk@mm(HL4wV#g82aAP?zXtTKfd>`dGG7@Jiqt% z%~G6;=~sU ze3eON1@YsKuh?ftd%U~4Kd6+t-p*aQAJtU;N%u*fqo)V{V{p2|0mZwwqUH6%tY2m= z%{8t6GJIG7!lcV!1XK}x+Cr_fI#K0EhWtXCTbOftWYuu7Yn7!igKA5Gh&u+`CXCuyvd0Y@ zH{OO8XkN3b$g~q-r4`z#PJQ~%!{RCDGL^(Xa|iJ20%X%={^gK-!8P6I?PnCrTK3*7 z8sLc|9kzRdH*TG3P#TRehs+E6ogT2i2LSbd1u}c&? z?%o}ebE;7kd7wZ{O)OsfOE=g{mjOyWKC8{g zWFEnW*$FG$Q2qQoRTJSv$E`a*oiUVaafm8Etu$Ect$8if+dyNz6ExW(6^R~xcv|deoqdLGQCBEO8Q>+Ke%BZ7q2(RC$Kxus42-&#>wT$ zm3DzPbR_Nq#Ee&>-h|KYu8rx7nXBc?N0UTq`{fPw4n!6>Iyhc8Y1d(k{ZjDU=X^Fh z=W0alecm{tk#`W3pGd02Ein^0R=W&KI#z=*Sdoe5<_WFu@8HM{mSL4jxkYYN`&@&o zEc^U?EUHi7a@F-qOM7iZYOV6?!M8+ojk1P@+$xC`rE*{sX|a<(2%hJ|BhuUq?Dy=I zd-6y4OSo*zJ_~?zg(#jQMd?JOkvJQGXrIL$jSE^ zPmR+>2gP^hH-qcyk&MUL^kboxlfna`h8;g=69w%*O2Q#GWp&p(kOxdL!U1ivPK6mu zoM*9WxL?Clj9SLe<@=VDW!UpJkzStNS6N3$szZHdDx%)H!?l?7@AU4lTtg)As#P7d zh$*m(3$vld;fh41+Jj6`RIOG@opOBH)o1r=M=Dat-3FZ9$scH?(?Ol0bGQD$CX3T@ zXQg;q5Vmn9DEYnU$8(1+^%K}HBaSzTlbUO$mYxgOMCD;yHRDD1!-?u^r?nfBDWMj6 zVNaHZ{6Q-dpk1Zm?d@?>5)W{oeUOG{W?g0esh5`RKI@1D(V?Q%BFSOe`4{TNwN=ce zZv>arSbB|Z_%`NQG}e~e)XR8P87=%EeAX|}{&(wrjEFV&iZFwV%Rm1xAzgFiQJenC z8(!(Y&Lb~ii>;Z+63$56PeHn(`ypqlweXj%N5W^of;d&dggfDDnzj^{E99VuRQ$XB z*Ai+a-fO2S`6wKwgJ5dzJ|Rb5E!-m3XUj?mVwn(YXR^b!6GHg?Z z`wL}ziUbEuhU$k2<~Qa}(jHTHM;fcMA0oi8BR#!9odQX)}rC)~&C<<2@w0D)Lr*p}cQ#Ja&Q*UDR1R$kXm4te~8O z$?X;~N5*Hm_f!O74>QMlN(k@g%gl5ZQZe!4K|Y21hUaKvH`YR9mn%eQ1%r>Q!qa2* zqu-6b3=|o8_Pw3U;b;OpUFj4iVKn^DT^?f{`@Zl`Tr~MoJtZT@Q0|qUsXoT{5Qm7G z!zUd5rE)lXqho0eaaTum!;xoyNOS?AC2Crk-q6){&YF35da~F4hB3e4@KhgqR_5|s z!gQ7rtWqAl#;UmM+CE5Sc$`rVR5q>@4aB8>DsJYaxHF(vG6T{C>idh>=*|=F{bT1P z9{q9N6S25h>xS;-tOj-T*2EJ}b^7!tE6Ko4)e!sjeRT2tT{+J0dmuG_4?hlbtQS8X zT9wnwC=j^F-?*ziFV@3!|0>S|bhJ)~Yo=WBKK@TLah&6e;Ond&x$2CeNsfbak(q)@ zf^OQC@8j^0uym)#6#_WM2F75__K&M^W0tkAYeOF1lnTS`v5p;F=;PHI+?70~C{<5( z=~I1s`<L}>Z_G61Gb6-_Z6s=$*lR& zV+Km2GtoklinbVZ$$j|^g5|<#zsM{8$JJIHDdQI2F)!0kV$5dabLO8tF~Nj`EVWO* zUcWG%o$~Hg?dJ=j%Q7u7`C>aaxPNaszme%yq%(kuP9%GJlQ!)FT$UN7g;IlKpco`f z4Wk8cgf?U~i1wSLC3KUG-DE!L+(-VmTnkLTRcbIM0X_-J8E%2V zJb%>(m>>e^9yGfEGTDYgp+UJH4O^J{f0qq>MR?LZfO80yya|**GC;j51ItbE#Bz@- zRKjY*HQ-1M7z_i0qtF_dlgj^koJ#>@FECeA1_RRb*9!qfqfiJG)JyOy2G>Aym1ph+ z_4@}yp@2%aVB8yE={B*y$9}!riorDy2w)L6 4 + for doc in documents: + assert "name" in doc.meta + assert "page_number" in doc.meta + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta + assert "common_meta" in doc.meta + assert doc.meta["common_meta"] == "common" From 3bc51a788356545dc692183591cf91bdf5391713 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 18 Jan 2024 14:25:36 +0100 Subject: [PATCH 04/17] Black formatting test --- integrations/unstructured/tests/test_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index ac10c4e1b..7ce1069bb 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -154,7 +154,7 @@ def test_run_one_doc_per_element_with_meta(self, samples_path): @pytest.mark.integration def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): pdf_path = [samples_path / "sample_pdf.pdf", samples_path / "sample_pdf2.pdf"] - meta = [{"custom_meta": "foobar", "common_meta":"common"}, {"other_meta": "barfoo", "common_meta":"common"}] + meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}] local_converter = UnstructuredFileConverter( api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" ) From 86eec4d70ee57e584f9e74e21dbe48ab87cf7771 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 19 Jan 2024 15:44:31 +0100 Subject: [PATCH 05/17] Fixing metadata page number bug. Deep copy of dict --- .../components/converters/unstructured/converter.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index c9db4eea3..5fd886800 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -131,7 +131,6 @@ def run( meta=metadata, ) documents.extend(docs_for_file) - return {"documents": documents} def _create_documents( @@ -140,7 +139,7 @@ def _create_documents( elements: List[Element], document_creation_mode: Literal["one-doc-per-file", "one-doc-per-page", "one-doc-per-element"], separator: str, - meta: Optional[Dict[str, Any]] = None, + meta: Dict[str, Any], ) -> List[Document]: """ Create Haystack Documents from the elements returned by Unstructured. @@ -149,7 +148,7 @@ def _create_documents( if document_creation_mode == "one-doc-per-file": text = separator.join([str(el) for el in elements]) - metadata = meta + metadata = meta.copy() metadata["name"] = str(filepath) docs = [Document(content=text, meta=metadata)] @@ -157,7 +156,7 @@ def _create_documents( texts_per_page: defaultdict[int, str] = defaultdict(str) meta_per_page: defaultdict[int, dict] = defaultdict(dict) for el in elements: - metadata = meta + metadata = meta.copy() metadata["name"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) @@ -170,7 +169,7 @@ def _create_documents( elif document_creation_mode == "one-doc-per-element": for el in elements: - metadata = meta + metadata = meta.copy() metadata["name"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) @@ -178,7 +177,6 @@ def _create_documents( metadata["category"] = el.category doc = Document(content=str(el), meta=metadata) docs.append(doc) - return docs def _partition_file_into_elements(self, filepath: Path) -> List[Element]: From 99a284786135cda285b8ee4ee501cb0c6a2ba758 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Mon, 22 Jan 2024 16:39:41 +0100 Subject: [PATCH 06/17] Folder of files test --- .../unstructured/tests/test_converter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index 7ce1069bb..2a9c332f9 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -169,3 +169,22 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): assert "category" in doc.meta assert "common_meta" in doc.meta assert doc.meta["common_meta"] == "common" + + @pytest.mark.integration + def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path): + pdf_path = [samples_path] + meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}] + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" + ) + + documents = local_converter.run(paths=pdf_path, meta=meta)["documents"] + + assert len(documents) > 4 + for doc in documents: + assert "name" in doc.meta + assert "page_number" in doc.meta + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta + assert "common_meta" in doc.meta + assert doc.meta["common_meta"] == "common" From cabf5d882d49db873346c044064f7c7a919a6961 Mon Sep 17 00:00:00 2001 From: Corentin Date: Tue, 23 Jan 2024 15:15:40 +0100 Subject: [PATCH 07/17] Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py Co-authored-by: Stefano Fiorucci --- .../components/converters/unstructured/converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 5fd886800..1c3b2a9e9 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -149,7 +149,7 @@ def _create_documents( if document_creation_mode == "one-doc-per-file": text = separator.join([str(el) for el in elements]) metadata = meta.copy() - metadata["name"] = str(filepath) + metadata["file_path"] = str(filepath) docs = [Document(content=text, meta=metadata)] elif document_creation_mode == "one-doc-per-page": From 52b06638fd110c07759955dcd609daabf1033b79 Mon Sep 17 00:00:00 2001 From: Corentin Date: Tue, 23 Jan 2024 15:16:07 +0100 Subject: [PATCH 08/17] Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py Co-authored-by: Stefano Fiorucci --- .../components/converters/unstructured/converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 1c3b2a9e9..9f72c98b3 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -156,7 +156,7 @@ def _create_documents( texts_per_page: defaultdict[int, str] = defaultdict(str) meta_per_page: defaultdict[int, dict] = defaultdict(dict) for el in elements: - metadata = meta.copy() + metadata = copy.deepcopy(meta) metadata["name"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) From 0f81c1696a93618de8316b116349489dd487863d Mon Sep 17 00:00:00 2001 From: Corentin Date: Tue, 23 Jan 2024 15:16:28 +0100 Subject: [PATCH 09/17] Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py Co-authored-by: Stefano Fiorucci --- .../components/converters/unstructured/converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 9f72c98b3..94bc19082 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -103,7 +103,8 @@ def run( :param meta: Optional metadata to attach to the Documents. This value can be either a list of dictionaries or a single dictionary. If it's a single dictionary, its content is added to the metadata of all produced Documents. - If it's a list, the length of the list must match the number of sources, because the two lists will be zipped. + If it's a list, the length of the list must match the number of paths, because the two lists will be zipped. + Please note that if the paths contain directories, the length of the meta list must match the actual number of files contained. Defaults to `None`. """ From a7d9b74e98b443599ff86b9903ad3447316eadfb Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Tue, 23 Jan 2024 15:25:14 +0100 Subject: [PATCH 10/17] Renaming "name" meta to "file_path" and deepcopy fix --- .../converters/unstructured/converter.py | 9 +++++---- .../unstructured/tests/test_converter.py | 18 +++++++++--------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 94bc19082..83b4457c8 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging import os +import copy from collections import defaultdict from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Union @@ -149,7 +150,7 @@ def _create_documents( if document_creation_mode == "one-doc-per-file": text = separator.join([str(el) for el in elements]) - metadata = meta.copy() + metadata = copy.deepcopy(meta) metadata["file_path"] = str(filepath) docs = [Document(content=text, meta=metadata)] @@ -158,7 +159,7 @@ def _create_documents( meta_per_page: defaultdict[int, dict] = defaultdict(dict) for el in elements: metadata = copy.deepcopy(meta) - metadata["name"] = str(filepath) + metadata["file_path"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) page_number = int(metadata.get("page_number", 1)) @@ -170,8 +171,8 @@ def _create_documents( elif document_creation_mode == "one-doc-per-element": for el in elements: - metadata = meta.copy() - metadata["name"] = str(filepath) + metadata = copy.deepcopy(meta) + metadata["file_path"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) if hasattr(el, "category"): diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index 2a9c332f9..d5266ac62 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -63,7 +63,7 @@ def test_run_one_doc_per_file(self, samples_path): documents = local_converter.run([pdf_path])["documents"] assert len(documents) == 1 - assert documents[0].meta == {"name": str(pdf_path)} + assert documents[0].meta == {"file_path": str(pdf_path)} @pytest.mark.integration def test_run_one_doc_per_page(self, samples_path): @@ -77,7 +77,7 @@ def test_run_one_doc_per_page(self, samples_path): assert len(documents) == 4 for i, doc in enumerate(documents, start=1): - assert doc.meta["name"] == str(pdf_path) + assert doc.meta["file_path"] == str(pdf_path) assert doc.meta["page_number"] == i @pytest.mark.integration @@ -92,7 +92,7 @@ def test_run_one_doc_per_element(self, samples_path): assert len(documents) > 4 for doc in documents: - assert doc.meta["name"] == str(pdf_path) + assert doc.meta["file_path"] == str(pdf_path) assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta @@ -109,10 +109,10 @@ def test_run_one_doc_per_file_with_meta(self, samples_path): documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] assert len(documents) == 1 - assert documents[0].meta["name"] == str(pdf_path) + assert documents[0].meta["file_path"] == str(pdf_path) assert "custom_meta" in documents[0].meta assert documents[0].meta["custom_meta"] == "foobar" - assert documents[0].meta == {"name": str(pdf_path), "custom_meta": "foobar"} + assert documents[0].meta == {"file_path": str(pdf_path), "custom_meta": "foobar"} @pytest.mark.integration def test_run_one_doc_per_page_with_meta(self, samples_path): @@ -126,7 +126,7 @@ def test_run_one_doc_per_page_with_meta(self, samples_path): assert len(documents) == 4 for i, doc in enumerate(documents, start=1): - assert doc.meta["name"] == str(pdf_path) + assert doc.meta["file_path"] == str(pdf_path) assert doc.meta["page_number"] == i assert "custom_meta" in doc.meta assert doc.meta["custom_meta"] == "foobar" @@ -143,7 +143,7 @@ def test_run_one_doc_per_element_with_meta(self, samples_path): assert len(documents) > 4 for doc in documents: - assert doc.meta["name"] == str(pdf_path) + assert doc.meta["file_path"] == str(pdf_path) assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta @@ -163,7 +163,7 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): assert len(documents) > 4 for doc in documents: - assert "name" in doc.meta + assert "file_path" in doc.meta assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta assert "category" in doc.meta @@ -182,7 +182,7 @@ def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path): assert len(documents) > 4 for doc in documents: - assert "name" in doc.meta + assert "file_path" in doc.meta assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta assert "category" in doc.meta From 736f6992f4e65fca35e0a315d82123c4f144535c Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Tue, 23 Jan 2024 15:30:57 +0100 Subject: [PATCH 11/17] Fix Ruff Complaining --- .../components/converters/unstructured/converter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 83b4457c8..bee1d9a7b 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -1,9 +1,9 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 +import copy import logging import os -import copy from collections import defaultdict from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Union @@ -105,7 +105,8 @@ def run( This value can be either a list of dictionaries or a single dictionary. If it's a single dictionary, its content is added to the metadata of all produced Documents. If it's a list, the length of the list must match the number of paths, because the two lists will be zipped. - Please note that if the paths contain directories, the length of the meta list must match the actual number of files contained. + Please note that if the paths contain directories, the length of the meta list must match + the actual number of files contained. Defaults to `None`. """ From 096ab495c3a2488ae7d5a2e73e06328cdf390392 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Sun, 4 Feb 2024 12:25:55 +0100 Subject: [PATCH 12/17] Removing unique file logic using set that does not preserve file orders. Raise error if glob and metadata list because unsafe --- .../converters/unstructured/converter.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index bee1d9a7b..1dd339fc9 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -109,12 +109,16 @@ def run( the actual number of files contained. Defaults to `None`. """ - - unique_paths = {Path(path) for path in paths} - filepaths = {path for path in unique_paths if path.is_file()} - filepaths_in_directories = { - filepath for path in unique_paths if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file() - } + paths_obj = [Path(path) for path in paths] + filepaths = [path for path in paths_obj if path.is_file()] + filepaths_in_directories = [ + filepath for path in paths_obj if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file() + ] + if filepaths_in_directories != [] and isinstance(meta, list): + error = """For directories as path metadata can only be a unique dictionary, not a list. + To provide different metadata (list) for each files, + please provide an explicit list of direct paths instead.""" + raise ValueError(error) all_filepaths = filepaths.union(filepaths_in_directories) From e2b3852c7e8e46548d8fabe266fe16bf8464ea4c Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Sun, 4 Feb 2024 12:26:38 +0100 Subject: [PATCH 13/17] Better test to make sure metadata order are preserved. --- integrations/unstructured/tests/test_converter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index d5266ac62..9e359ab6f 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -154,7 +154,10 @@ def test_run_one_doc_per_element_with_meta(self, samples_path): @pytest.mark.integration def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): pdf_path = [samples_path / "sample_pdf.pdf", samples_path / "sample_pdf2.pdf"] - meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}] + meta = [ + {"custom_meta": "sample_pdf.pdf", "common_meta": "common"}, + {"custom_meta": "sample_pdf2.pdf", "common_meta": "common"}, + ] local_converter = UnstructuredFileConverter( api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" ) @@ -163,6 +166,7 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): assert len(documents) > 4 for doc in documents: + assert doc.meta["custom_meta"] == doc.meta["filename"] assert "file_path" in doc.meta assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta From 1c4802e6f1004d27ed44366c1ece2ae0b39a42a9 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Sun, 4 Feb 2024 12:27:16 +0100 Subject: [PATCH 14/17] Make a failing test if metadata list and directory --- integrations/unstructured/tests/test_converter.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index 9e359ab6f..038807b14 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -175,12 +175,22 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): assert doc.meta["common_meta"] == "common" @pytest.mark.integration - def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path): + def test_run_one_doc_per_element_with_meta_list_folder_fail(self, samples_path): pdf_path = [samples_path] meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}] local_converter = UnstructuredFileConverter( api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" ) + with pytest.raises(ValueError): + local_converter.run(paths=pdf_path, meta=meta)["documents"] + + @pytest.mark.integration + def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path): + pdf_path = [samples_path] + meta = {"common_meta": "common"} + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" + ) documents = local_converter.run(paths=pdf_path, meta=meta)["documents"] From dd40d0abd5bcd087f23e1b5d0fc1cabdddb93529 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Sun, 4 Feb 2024 12:27:33 +0100 Subject: [PATCH 15/17] filepaths as lists --- .../components/converters/unstructured/converter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 1dd339fc9..84143d557 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -120,8 +120,7 @@ def run( please provide an explicit list of direct paths instead.""" raise ValueError(error) - all_filepaths = filepaths.union(filepaths_in_directories) - + all_filepaths = filepaths + filepaths_in_directories # currently, the files are converted sequentially to gently handle API failures documents = [] meta_list = normalize_metadata(meta, sources_count=len(all_filepaths)) From 7358a0af381f54f1cbf5239cf0719892d63e4536 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 5 Feb 2024 09:41:52 +0100 Subject: [PATCH 16/17] Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py Co-authored-by: Stefano Fiorucci --- .../components/converters/unstructured/converter.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 84143d557..675514c83 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -114,10 +114,11 @@ def run( filepaths_in_directories = [ filepath for path in paths_obj if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file() ] - if filepaths_in_directories != [] and isinstance(meta, list): - error = """For directories as path metadata can only be a unique dictionary, not a list. - To provide different metadata (list) for each files, - please provide an explicit list of direct paths instead.""" + if filepaths_in_directories and isinstance(meta, list): + error = """"If providing directories in the `paths` parameter, + `meta` can only be a dictionary (metadata applied to every file), + and not a list. To specify different metadata for each file, + provide an explicit list of direct paths instead.""" raise ValueError(error) all_filepaths = filepaths + filepaths_in_directories From 534d1b13c3f049b5390041d87056c2ad6b563dc4 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Mon, 5 Feb 2024 09:55:59 +0100 Subject: [PATCH 17/17] update meta docstrings --- .../components/converters/unstructured/converter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 675514c83..188dd9e6e 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -105,8 +105,8 @@ def run( This value can be either a list of dictionaries or a single dictionary. If it's a single dictionary, its content is added to the metadata of all produced Documents. If it's a list, the length of the list must match the number of paths, because the two lists will be zipped. - Please note that if the paths contain directories, the length of the meta list must match - the actual number of files contained. + Please note that if the paths contain directories, meta can only be a single dictionary + (same metadata for all files). Defaults to `None`. """ paths_obj = [Path(path) for path in paths]