Skip to content

Commit

Permalink
Merge pull request #32303 Preserve numeric string literals when readi…
Browse files Browse the repository at this point in the history
…ng from json.
  • Loading branch information
robertwb authored Aug 27, 2024
2 parents 857ecce + 679e5cc commit bc80e9f
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@

## Breaking Changes

* In Python and YAML, ReadFromJson now overrides the default dtype, changing
it from None to an explicit False. Most notably, string values like `"123"`
are preserved as strings rather than silently coerced (and possibly
truncated) to numeric values. To retain the old behavior, pass `dtype=True`
(or any other value accepted by `pandas.read_json`).
* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)).

## Deprecations
Expand Down
14 changes: 12 additions & 2 deletions sdks/python/apache_beam/io/textio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
from functools import partial
from typing import TYPE_CHECKING
from typing import Any
from typing import Dict
from typing import Optional
from typing import Union

from apache_beam import typehints
from apache_beam.coders import coders
Expand Down Expand Up @@ -980,7 +982,12 @@ def WriteToCsv(

@append_pandas_args(pandas.read_json, exclude=['path_or_buf'])
def ReadFromJson(
path: str, *, orient: str = 'records', lines: bool = True, **kwargs):
path: str,
*,
orient: str = 'records',
lines: bool = True,
dtype: Union[bool, Dict[str, Any]] = False,
**kwargs):
"""A PTransform for reading json values from files into a PCollection.
Args:
Expand All @@ -992,11 +999,14 @@ def ReadFromJson(
lines (bool): Whether each line should be considered a separate record,
as opposed to the entire file being a valid JSON object or list.
Defaults to True (unlike Pandas).
dtype (bool or dict): If True, infer dtypes; if a dict of column to dtype,
then use those; if False, then don't infer dtypes at all.
Defaults to False (unlike Pandas).
**kwargs: Extra arguments passed to `pandas.read_json` (see below).
"""
from apache_beam.dataframe.io import ReadViaPandas
return 'ReadFromJson' >> ReadViaPandas(
'json', path, orient=orient, lines=lines, **kwargs)
'json', path, orient=orient, lines=lines, dtype=dtype, **kwargs)

@append_pandas_args(
pandas.DataFrame.to_json, exclude=['path_or_buf', 'index'])
Expand Down
30 changes: 30 additions & 0 deletions sdks/python/apache_beam/io/textio_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1743,6 +1743,36 @@ def test_json_read_write(self):

assert_that(pcoll, equal_to(records))

def test_numeric_strings_preserved(self):
  """Checks that numeric-looking strings survive a JSON write/read round trip.

  Rows are written with WriteToJson and read back with ReadFromJson; the
  values read back must equal the originals and have the same Python types.
  """
  # Each row pairs a numeric string with the corresponding real number, so
  # any coercion of strings to numbers on read shows up as a mismatch.
  records = [
      beam.Row(
          as_string=str(ix),
          as_float_string=str(float(ix)),
          as_int=ix,
          as_float=float(ix)) for ix in range(3)
  ]
  with tempfile.TemporaryDirectory() as dest:
    with TestPipeline() as p:
      # pylint: disable=expression-not-assigned
      p | beam.Create(records) | beam.io.WriteToJson(
          os.path.join(dest, 'out'))
    with TestPipeline() as p:
      pcoll = (
          p
          | beam.io.ReadFromJson(os.path.join(dest, 'out*'))
          # Rebuild each element as a beam.Row keyed by its field names so it
          # can be compared directly against the rows that were written.
          | beam.Map(lambda t: beam.Row(**dict(zip(type(t)._fields, t)))))

      assert_that(pcoll, equal_to(records))

      # This test should be redundant as Python equality does not equate
      # numeric values with their string representations, but this is much
      # more explicit about what we're asserting here.
      def check_types(element):
        # Every record has the same field types, so records[0] serves as
        # the type template for all elements.
        for a, b in zip(element, records[0]):
          # Exact type match is intended here, hence identity, not equality.
          assert type(a) is type(b), (a, b, type(a), type(b))

      _ = pcoll | beam.Map(check_types)


if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
Expand Down

0 comments on commit bc80e9f

Please sign in to comment.