Skip to content

Commit

Permalink
Restructure logic to minimize the number of file system accesses
Browse files Browse the repository at this point in the history
This also introduces a method that uses a glob to find all version folders, instead of listing every entry in the directory and then calling is_dir on each one.

PiperOrigin-RevId: 697562330
  • Loading branch information
tomvdw authored and The TensorFlow Datasets Authors committed Nov 18, 2024
1 parent 0ca4911 commit ca2973c
Show file tree
Hide file tree
Showing 7 changed files with 271 additions and 231 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for conll_dataset_builder."""
import textwrap
from unittest import mock

from etils import epath
import pytest
Expand All @@ -25,28 +23,22 @@

_FOLDER_PATH = "mock/path"

_VALID_INPUT = textwrap.dedent(
"""
_VALID_INPUT = textwrap.dedent("""
-DOCSTART- -X- -X- O
Winter NN B-NP O
is VBZ B-VP O
Air NN I-NP O
. . O O
"""
)
""")

_INVALID_INPUT = textwrap.dedent(
"""
_INVALID_INPUT = textwrap.dedent("""
Winter NN B-NP
is VBZ B-VP O
Air NN I-NP O
. . O O
"""
)

_INPUT_PATH = epath.Path(_FOLDER_PATH, "input_path.txt")
""")


class DummyConllDataset(conll_dataset_builder.ConllDatasetBuilder):
Expand All @@ -63,60 +55,56 @@ def _info(self) -> tfds.core.DatasetInfo:
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators.

  Args:
    dl_manager: unused; this dummy builder reads a fixed local path and
      downloads nothing.

  Returns:
    A mapping from split name to the example generator for that split.
  """
  del dl_manager  # No downloads needed for the dummy dataset.
  return {"train": self._generate_examples("/tmp/input.txt")}


def test_generate_example(tmpdir):
  """Checks that a valid CoNLL file yields the expected two examples."""
  root = epath.Path(tmpdir)
  data_file = root / "input_path.txt"
  data_file.write_text(_VALID_INPUT)

  builder = DummyConllDataset(data_dir=root)
  produced = list(builder._generate_examples(data_file))

  expected = [
      (
          0,
          {
              "tokens": ["Winter", "is"],
              "pos": ["NN", "VBZ"],
              "chunks": ["B-NP", "B-VP"],
              "ner": ["O", "O"],
          },
      ),
      (
          1,
          {
              "tokens": ["Air", "."],
              "pos": ["NN", "."],
              "chunks": ["I-NP", "O"],
              "ner": ["O", "O"],
          },
      ),
  ]
  assert produced == expected

  # Every example must expose exactly the CoNLL-2003 feature set.
  for _, features in produced:
    assert len(features) == len(conll_lib.CONLL_2003_ORDERED_FEATURES)

  assert len(produced) == 2


def test_generate_corrupted_example(tmpdir):
  """Checks that a line with too few columns raises a ValueError."""
  tmpdir = epath.Path(tmpdir)
  input_path = tmpdir / "input_path.txt"
  input_path.write_text(_INVALID_INPUT)
  dataset = DummyConllDataset(data_dir=tmpdir)

  # The first line of _INVALID_INPUT is missing its NER column.
  error_line = "Winter NN B-NP"
  error_msg = (
      f"Mismatch in the number of features found in line: {error_line}\n\n"
      "Should be 4, but found 3"
  )
  with pytest.raises(ValueError, match=error_msg):
    list(dataset._generate_examples(input_path))
Loading

0 comments on commit ca2973c

Please sign in to comment.