diff --git a/packages/bigframes/tests/data/nested_structs.jsonl b/packages/bigframes/tests/data/nested_structs.jsonl index f57214b0b3c6..97e230c91974 100644 --- a/packages/bigframes/tests/data/nested_structs.jsonl +++ b/packages/bigframes/tests/data/nested_structs.jsonl @@ -1,2 +1,6 @@ -{"id": 1, "person": {"name": "Alice", "age":30, "address": {"city": "New York", "country": "USA"}}} -{"id": 2, "person": {"name": "Bob", "age":25, "address": {"city": "London", "country": "UK"}}} \ No newline at end of file +{"id": 1, "person": {"name": "Alice", "age": 30, "address": {"city": "New York", "country": "USA"}}, "bool_col": true, "int64_col": "123456789", "float64_col": 1.25, "string_col": "Hello World", "json_col": {"a": 1, "b": [1, 2]}, "date_col": "2026-06-24", "time_col": "12:34:56.789012", "datetime_col": "2026-06-24 12:34:56.789012", "timestamp_col": "2026-06-24T12:34:56.789012Z", "bytes_col": "SGVsbG8=", "numeric_col": "123456.789", "bignumeric_col": "123456.7890123456789", "geography_col": "POINT(30 10)", "duration_col": "1000"} +{"id": 2, "person": {"name": "", "age": -1, "address": {"city": "", "country": ""}}, "bool_col": false, "int64_col": "-9223372036854775808", "float64_col": "-Infinity", "string_col": "", "json_col": {}, "date_col": "0001-01-01", "time_col": "00:00:00", "datetime_col": "0001-01-02 00:00:00", "timestamp_col": "0001-01-02T00:00:00Z", "bytes_col": "", "numeric_col": "-99999999999999999999999999999.999999999", "bignumeric_col": "-99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POINT(0 0)", "duration_col": "-9223372036854775"} +{"id": 3, "person": {"name": "Very Long Name...", "age": 150, "address": {"city": "City", "country": "Country"}}, "bool_col": true, "int64_col": "9223372036854775807", "float64_col": "Infinity", "string_col": "Unicode: 🚀 Spark ✨", "json_col": {"max": true, "nested": {"val": 999}}, "date_col": "9999-12-31", "time_col": "23:59:59.999999", "datetime_col": "9999-12-31 
23:59:59.999999", "timestamp_col": "9999-12-31T23:59:59.999999Z", "bytes_col": "dmVyeSBsb25nIGJ5dGVzIHZhbHVl", "numeric_col": "99999999999999999999999999999.999999999", "bignumeric_col": "99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))", "duration_col": "9223372036854775"} +{"id": 4, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null} +{"id": 5, "person": {"name": "Bob", "age": 0, "address": null}, "bool_col": false, "int64_col": "0", "float64_col": "NaN", "string_col": "Line 1\nLine 2\n\"Quotes\"", "json_col": [1, "two", null], "date_col": "1970-01-01", "time_col": "12:00:00", "datetime_col": "1970-01-01 12:00:00", "timestamp_col": "1970-01-01T12:00:00Z", "bytes_col": "AA==", "numeric_col": "0", "bignumeric_col": "0", "geography_col": "LINESTRING(0 0, 1 1, 2 2)", "duration_col": "0"} +{"id": 6, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "json_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null} diff --git a/packages/bigframes/tests/data/nested_structs_schema.json b/packages/bigframes/tests/data/nested_structs_schema.json index 6692615ceffa..06e4a3e5275c 100644 --- a/packages/bigframes/tests/data/nested_structs_schema.json +++ b/packages/bigframes/tests/data/nested_structs_schema.json @@ -7,6 +7,7 @@ { "name": "person", "type": "RECORD", + "mode": "NULLABLE", "fields": [ { "name": "name", @@ -21,6 +22,7 @@ { "name": "address", "type": "RECORD", + "mode": "NULLABLE", "fields": [ { "name": "city", @@ -35,5 +37,76 @@ ] } ] + }, + { + "name": "bool_col", + 
"type": "BOOLEAN", + "mode": "NULLABLE" + }, + { + "name": "int64_col", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "float64_col", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "string_col", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "json_col", + "type": "JSON", + "mode": "NULLABLE" + }, + { + "name": "date_col", + "type": "DATE", + "mode": "NULLABLE" + }, + { + "name": "time_col", + "type": "TIME", + "mode": "NULLABLE" + }, + { + "name": "datetime_col", + "type": "DATETIME", + "mode": "NULLABLE" + }, + { + "name": "timestamp_col", + "type": "TIMESTAMP", + "mode": "NULLABLE" + }, + { + "name": "bytes_col", + "type": "BYTES", + "mode": "NULLABLE" + }, + { + "name": "numeric_col", + "type": "NUMERIC", + "mode": "NULLABLE" + }, + { + "name": "bignumeric_col", + "type": "BIGNUMERIC", + "mode": "NULLABLE" + }, + { + "name": "geography_col", + "type": "GEOGRAPHY", + "mode": "NULLABLE" + }, + { + "name": "duration_col", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "#microseconds" } ] diff --git a/packages/bigframes/tests/system/conftest.py b/packages/bigframes/tests/system/conftest.py index f6fbdd0c510d..230a39a29f87 100644 --- a/packages/bigframes/tests/system/conftest.py +++ b/packages/bigframes/tests/system/conftest.py @@ -496,14 +496,221 @@ def nested_structs_df( @pytest.fixture(scope="session") def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.DataFrame: - """pd.DataFrame pointing at test data.""" + """pd.DataFrame pointing at test data. + + Manually parses using json.loads to preserve data types. 
+ """ + import base64 + import datetime + import decimal + import json + + import db_dtypes + import geopandas as gpd + + with open(DATA_DIR / "nested_structs.jsonl") as f: + raw_rows = [json.loads(line) for line in f] + + ids = [row["id"] for row in raw_rows] + + def get_val(row, col_name): + return row.get(col_name) + + # person + person_struct_schema = nested_structs_pandas_type.pyarrow_dtype + processed_person = [] + for row in raw_rows: + x = get_val(row, "person") + if x is None: + processed_person.append(None) + else: + d = dict(x) + if "age" in d and d["age"] is not None: + d["age"] = int(d["age"]) + processed_person.append(d) + person_arr = pa.array(processed_person, type=person_struct_schema) + person_ser = pd.Series(person_arr, index=ids, dtype=nested_structs_pandas_type) + + # bool_col + bool_vals = [ + bool(get_val(row, "bool_col")) if get_val(row, "bool_col") is not None else None + for row in raw_rows + ] + bool_ser = pd.Series(bool_vals, index=ids, dtype=pd.BooleanDtype()) + + # int64_col + int64_vals = [ + int(get_val(row, "int64_col")) + if get_val(row, "int64_col") is not None + else None + for row in raw_rows + ] + int64_ser = pd.Series(int64_vals, index=ids, dtype=pd.Int64Dtype()) + + # float64_col + float64_vals = [ + float(get_val(row, "float64_col")) + if get_val(row, "float64_col") is not None + else None + for row in raw_rows + ] + arr = pa.array(float64_vals, type=pa.float64()) + mask = pa.compute.is_null(arr) + nonnull = pa.compute.fill_null(arr, float("nan")) + pd_array = pd.arrays.FloatingArray( + nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy(zero_copy_only=False), + ) + float64_ser = pd.Series(pd_array, index=ids, dtype=pd.Float64Dtype()) + + # string_col + string_vals = [ + str(get_val(row, "string_col")) + if get_val(row, "string_col") is not None + else None + for row in raw_rows + ] + string_ser = pd.Series( + string_vals, index=ids, dtype=pd.StringDtype(storage="pyarrow") + ) - df = pd.read_json( - DATA_DIR / 
"nested_structs.jsonl", - lines=True, + # json_col + json_strs = [] + for row in raw_rows: + if "json_col" not in row: + json_strs.append(None) + elif row["json_col"] is None: + json_strs.append("null") + else: + json_strs.append( + json.dumps(row["json_col"], sort_keys=True, separators=(",", ":")) + ) + json_arr = pa.array(json_strs, type=db_dtypes.JSONArrowType()) + json_ser = pd.Series( + json_arr, index=ids, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()) + ) + + # date_col + date_vals = [ + datetime.date.fromisoformat(get_val(row, "date_col")) + if get_val(row, "date_col") is not None + else None + for row in raw_rows + ] + date_arr = pa.array(date_vals, type=pa.date32()) + date_ser = pd.Series(date_arr, index=ids, dtype=pd.ArrowDtype(pa.date32())) + + # time_col + time_vals = [ + datetime.time.fromisoformat(get_val(row, "time_col")) + if get_val(row, "time_col") is not None + else None + for row in raw_rows + ] + time_arr = pa.array(time_vals, type=pa.time64("us")) + time_ser = pd.Series(time_arr, index=ids, dtype=pd.ArrowDtype(pa.time64("us"))) + + # datetime_col + datetime_vals = [] + for row in raw_rows: + val = get_val(row, "datetime_col") + if val is None: + datetime_vals.append(None) + else: + datetime_vals.append(datetime.datetime.fromisoformat(val.replace(" ", "T"))) + datetime_arr = pa.array(datetime_vals, type=pa.timestamp("us")) + datetime_ser = pd.Series( + datetime_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us")) + ) + + # timestamp_col + timestamp_vals = [ + datetime.datetime.fromisoformat(get_val(row, "timestamp_col")) + if get_val(row, "timestamp_col") is not None + else None + for row in raw_rows + ] + timestamp_arr = pa.array(timestamp_vals, type=pa.timestamp("us", tz="UTC")) + timestamp_ser = pd.Series( + timestamp_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC")) + ) + + # bytes_col + bytes_vals = [] + for row in raw_rows: + val = get_val(row, "bytes_col") + if val is None: + bytes_vals.append(None) + elif val == "": 
+ bytes_vals.append(b"") + else: + bytes_vals.append(base64.b64decode(val)) + bytes_arr = pa.array(bytes_vals, type=pa.binary()) + bytes_ser = pd.Series(bytes_arr, index=ids, dtype=pd.ArrowDtype(pa.binary())) + + # numeric_col + numeric_vals = [ + decimal.Decimal(str(get_val(row, "numeric_col"))) + if get_val(row, "numeric_col") is not None + else None + for row in raw_rows + ] + numeric_arr = pa.array(numeric_vals, type=pa.decimal128(38, 9)) + numeric_ser = pd.Series( + numeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal128(38, 9)) + ) + + # bignumeric_col + bignumeric_vals = [ + decimal.Decimal(str(get_val(row, "bignumeric_col"))) + if get_val(row, "bignumeric_col") is not None + else None + for row in raw_rows + ] + bignumeric_arr = pa.array(bignumeric_vals, type=pa.decimal256(76, 38)) + bignumeric_ser = pd.Series( + bignumeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal256(76, 38)) + ) + + # geography_col + geo_vals = [get_val(row, "geography_col") for row in raw_rows] + geo_ser = gpd.GeoSeries.from_wkt(geo_vals) + geo_ser.index = ids + + # duration_col + duration_vals = [ + int(get_val(row, "duration_col")) + if get_val(row, "duration_col") is not None + else None + for row in raw_rows + ] + duration_arr = pa.array(duration_vals, type=pa.duration("us")) + duration_ser = pd.Series( + duration_arr, index=ids, dtype=pd.ArrowDtype(pa.duration("us")) + ) + + df = pd.DataFrame( + { + "person": person_ser, + "bool_col": bool_ser, + "int64_col": int64_ser, + "float64_col": float64_ser, + "string_col": string_ser, + "json_col": json_ser, + "date_col": date_ser, + "time_col": time_ser, + "datetime_col": datetime_ser, + "timestamp_col": timestamp_ser, + "bytes_col": bytes_ser, + "numeric_col": numeric_ser, + "bignumeric_col": bignumeric_ser, + "geography_col": geo_ser, + "duration_col": duration_ser, + }, + index=ids, ) - df = df.set_index("id") - df["person"] = df["person"].astype(nested_structs_pandas_type) + df.index.name = "id" + return df diff --git 
def test_nested_structs_dtypes_and_edge_cases(nested_structs_df):
    """Check each column's BigFrames dtype, then spot-check edge-case values."""
    import datetime as dt
    import decimal

    import numpy as np
    import pandas as pd

    import bigframes.dtypes as bfd

    # Part 1: BigFrames dtypes, column by column.
    # "person" is compared against its own dtype, so that entry only proves
    # the column can be accessed.
    dtype_expectations = [
        ("person", nested_structs_df["person"].dtype),
        ("bool_col", bfd.BOOL_DTYPE),
        ("int64_col", bfd.INT_DTYPE),
        ("float64_col", bfd.FLOAT_DTYPE),
        ("string_col", bfd.STRING_DTYPE),
        ("json_col", bfd.JSON_DTYPE),
        ("date_col", bfd.DATE_DTYPE),
        ("time_col", bfd.TIME_DTYPE),
        ("datetime_col", bfd.DATETIME_DTYPE),
        ("timestamp_col", bfd.TIMESTAMP_DTYPE),
        ("bytes_col", bfd.BYTES_DTYPE),
        ("numeric_col", bfd.NUMERIC_DTYPE),
        ("bignumeric_col", bfd.BIGNUMERIC_DTYPE),
        ("geography_col", bfd.GEO_DTYPE),
        ("duration_col", bfd.TIMEDELTA_DTYPE),
    ]
    for col_name, expected_dtype in dtype_expectations:
        actual_dtype = nested_structs_df[col_name].dtype
        assert actual_dtype == expected_dtype, f"Dtype mismatch for {col_name}"

    # Part 2: materialize locally and inspect individual cells.
    local_df = nested_structs_df.to_pandas()
    assert len(local_df) == 6

    exact_cells = [
        # Row 1: ordinary values.
        (1, "bool_col", True),
        (1, "int64_col", 123456789),
        (1, "float64_col", 1.25),
        (1, "string_col", "Hello World"),
        (1, "json_col", '{"a":1,"b":[1,2]}'),
        (1, "date_col", dt.date(2026, 6, 24)),
        # Row 2: lower bounds and negative infinity.
        (2, "int64_col", -9223372036854775808),
        (2, "float64_col", float("-inf")),
        (
            2,
            "numeric_col",
            decimal.Decimal("-99999999999999999999999999999.999999999"),
        ),
        # Row 3: upper bounds and positive infinity.
        (3, "int64_col", 9223372036854775807),
        (3, "float64_col", float("inf")),
    ]
    for row_id, col_name, expected in exact_cells:
        assert local_df.loc[row_id, col_name] == expected

    # Row 4: each of these columns is expected to come back as missing.
    for col_name in (
        "bool_col",
        "int64_col",
        "float64_col",
        "json_col",
        "geography_col",
    ):
        assert pd.isna(local_df.loc[4, col_name])

    # Row 5: NaN must be a real float value, not a missing marker.
    nan_cell = local_df.loc[5, "float64_col"]
    assert np.isnan(nan_cell)
    assert local_df.loc[5, "float64_col"] is not pd.NA
    assert not local_df["float64_col"].isna().loc[5]
    assert local_df.loc[5, "string_col"] == 'Line 1\nLine 2\n"Quotes"'
    assert local_df.loc[5, "bytes_col"] == b"\x00"

    # Row 6: a JSON null literal is a value, distinct from a missing cell.
    assert local_df.loc[6, "json_col"] == "null"
    assert not local_df["json_col"].isna().loc[6]