Skip to content

Commit

Permalink
Merge branch 'main' into add-recursive-chunking
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsbatista committed Dec 12, 2024
2 parents 3cb85d9 + 3f77d3a commit 459bfa7
Show file tree
Hide file tree
Showing 19 changed files with 648 additions and 133 deletions.
7 changes: 0 additions & 7 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import copy
import hashlib
import os
import warnings
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Union
Expand Down Expand Up @@ -143,12 +142,6 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
azure_output.append(result.to_dict())

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
8 changes: 0 additions & 8 deletions haystack/components/converters/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -94,13 +93,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Expand Down
8 changes: 0 additions & 8 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import csv
import io
import os
import warnings
from dataclasses import dataclass
from enum import Enum
from io import StringIO
Expand Down Expand Up @@ -189,13 +188,6 @@ def run(
)
continue

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

docx_metadata = self._get_docx_metadata(document=docx_document)
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}

Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -123,12 +122,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import json
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union

Expand Down Expand Up @@ -280,12 +279,6 @@ def run(

data = self._get_content_and_meta(bytestream)

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
for text, extra_meta in data:
merged_metadata = {**bytestream.meta, **metadata, **extra_meta}

Expand Down
8 changes: 0 additions & 8 deletions haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -112,13 +111,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)

Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -172,12 +171,6 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -104,12 +103,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
8 changes: 1 addition & 7 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -220,12 +219,7 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document.meta = merged_metadata
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -139,12 +138,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -93,12 +92,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
Loading

0 comments on commit 459bfa7

Please sign in to comment.