From d03864c1090ecd5eb2b697ffaeb78bf70c13e283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 24 Jun 2026 21:31:09 +0000 Subject: [PATCH 1/7] chore: add test of all supported BigQuery data types This should be helpful in more thorough testing of the new compiler. Towards b/527509188 --- .../bigframes/tests/data/nested_structs.jsonl | 8 +- .../tests/data/nested_structs_schema.json | 73 ++++++ packages/bigframes/tests/system/conftest.py | 219 +++++++++++++++++- .../tests/system/small/test_pandas.py | 78 +++++++ 4 files changed, 370 insertions(+), 8 deletions(-) diff --git a/packages/bigframes/tests/data/nested_structs.jsonl b/packages/bigframes/tests/data/nested_structs.jsonl index f57214b0b3c6..97e230c91974 100644 --- a/packages/bigframes/tests/data/nested_structs.jsonl +++ b/packages/bigframes/tests/data/nested_structs.jsonl @@ -1,2 +1,6 @@ -{"id": 1, "person": {"name": "Alice", "age":30, "address": {"city": "New York", "country": "USA"}}} -{"id": 2, "person": {"name": "Bob", "age":25, "address": {"city": "London", "country": "UK"}}} \ No newline at end of file +{"id": 1, "person": {"name": "Alice", "age": 30, "address": {"city": "New York", "country": "USA"}}, "bool_col": true, "int64_col": "123456789", "float64_col": 1.25, "string_col": "Hello World", "json_col": {"a": 1, "b": [1, 2]}, "date_col": "2026-06-24", "time_col": "12:34:56.789012", "datetime_col": "2026-06-24 12:34:56.789012", "timestamp_col": "2026-06-24T12:34:56.789012Z", "bytes_col": "SGVsbG8=", "numeric_col": "123456.789", "bignumeric_col": "123456.7890123456789", "geography_col": "POINT(30 10)", "duration_col": "1000"} +{"id": 2, "person": {"name": "", "age": -1, "address": {"city": "", "country": ""}}, "bool_col": false, "int64_col": "-9223372036854775808", "float64_col": "-Infinity", "string_col": "", "json_col": {}, "date_col": "0001-01-01", "time_col": "00:00:00", "datetime_col": "0001-01-02 00:00:00", "timestamp_col": "0001-01-02T00:00:00Z", "bytes_col": "", "numeric_col": "-99999999999999999999999999999.999999999", "bignumeric_col": "-99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POINT(0 0)", "duration_col": "-9223372036854775"} +{"id": 3, "person": {"name": "Very Long Name...", "age": 150, "address": {"city": "City", "country": "Country"}}, "bool_col": true, "int64_col": "9223372036854775807", "float64_col": "Infinity", "string_col": "Unicode: 🚀 Spark ✨", "json_col": {"max": true, "nested": {"val": 999}}, "date_col": "9999-12-31", "time_col": "23:59:59.999999", "datetime_col": "9999-12-31 23:59:59.999999", "timestamp_col": "9999-12-31T23:59:59.999999Z", "bytes_col": "dmVyeSBsb25nIGJ5dGVzIHZhbHVl", "numeric_col": "99999999999999999999999999999.999999999", "bignumeric_col": "99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))", "duration_col": "9223372036854775"} +{"id": 4, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null} +{"id": 5, "person": {"name": "Bob", "age": 0, "address": null}, "bool_col": false, "int64_col": "0", "float64_col": "NaN", "string_col": "Line 1\nLine 2\n\"Quotes\"", "json_col": [1, "two", null], "date_col": "1970-01-01", "time_col": "12:00:00", "datetime_col": "1970-01-01 12:00:00", "timestamp_col": "1970-01-01T12:00:00Z", "bytes_col": "AA==", "numeric_col": "0", "bignumeric_col": "0", "geography_col": "LINESTRING(0 0, 1 1, 2 2)", "duration_col": "0"} +{"id": 6, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "json_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null} diff --git a/packages/bigframes/tests/data/nested_structs_schema.json b/packages/bigframes/tests/data/nested_structs_schema.json index 6692615ceffa..06e4a3e5275c 100644 --- a/packages/bigframes/tests/data/nested_structs_schema.json +++ b/packages/bigframes/tests/data/nested_structs_schema.json @@ -7,6 +7,7 @@ { "name": "person", "type": "RECORD", + "mode": "NULLABLE", "fields": [ { "name": "name", @@ -21,6 +22,7 @@ { "name": "address", "type": "RECORD", + "mode": "NULLABLE", "fields": [ { "name": "city", @@ -35,5 +37,76 @@ ] } ] + }, + { + "name": "bool_col", + "type": "BOOLEAN", + "mode": "NULLABLE" + }, + { + "name": "int64_col", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "float64_col", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "string_col", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "json_col", + "type": "JSON", + "mode": "NULLABLE" + }, + { + "name": "date_col", + "type": "DATE", + "mode": "NULLABLE" + }, + { + "name": "time_col", + "type": "TIME", + "mode": "NULLABLE" + }, + { + "name": "datetime_col", + "type": "DATETIME", + "mode": "NULLABLE" + }, + { + "name": "timestamp_col", + "type": "TIMESTAMP", + "mode": "NULLABLE" + }, + { + "name": "bytes_col", + "type": "BYTES", + "mode": "NULLABLE" + }, + { + "name": "numeric_col", + "type": "NUMERIC", + "mode": "NULLABLE" + }, + { + "name": "bignumeric_col", + "type": "BIGNUMERIC", + "mode": "NULLABLE" + }, + { + "name": "geography_col", + "type": "GEOGRAPHY", + "mode": "NULLABLE" + }, + { + "name": "duration_col", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "#microseconds" } ] diff --git a/packages/bigframes/tests/system/conftest.py b/packages/bigframes/tests/system/conftest.py index f6fbdd0c510d..230a39a29f87 100644 --- a/packages/bigframes/tests/system/conftest.py +++ b/packages/bigframes/tests/system/conftest.py @@ -496,14 +496,221 @@ def nested_structs_df( @pytest.fixture(scope="session") def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.DataFrame: - """pd.DataFrame pointing at test data.""" + """pd.DataFrame pointing at test data. + + Manually parses using json.loads to preserve data types. + """ + import base64 + import datetime + import decimal + import json + + import db_dtypes + import geopandas as gpd + + with open(DATA_DIR / "nested_structs.jsonl") as f: + raw_rows = [json.loads(line) for line in f] + + ids = [row["id"] for row in raw_rows] + + def get_val(row, col_name): + return row.get(col_name) + + # person + person_struct_schema = nested_structs_pandas_type.pyarrow_dtype + processed_person = [] + for row in raw_rows: + x = get_val(row, "person") + if x is None: + processed_person.append(None) + else: + d = dict(x) + if "age" in d and d["age"] is not None: + d["age"] = int(d["age"]) + processed_person.append(d) + person_arr = pa.array(processed_person, type=person_struct_schema) + person_ser = pd.Series(person_arr, index=ids, dtype=nested_structs_pandas_type) + + # bool_col + bool_vals = [ + bool(get_val(row, "bool_col")) if get_val(row, "bool_col") is not None else None + for row in raw_rows + ] + bool_ser = pd.Series(bool_vals, index=ids, dtype=pd.BooleanDtype()) + + # int64_col + int64_vals = [ + int(get_val(row, "int64_col")) + if get_val(row, "int64_col") is not None + else None + for row in raw_rows + ] + int64_ser = pd.Series(int64_vals, index=ids, dtype=pd.Int64Dtype()) + + # float64_col + float64_vals = [ + float(get_val(row, "float64_col")) + if get_val(row, "float64_col") is not None + else None + for row in raw_rows + ] + arr = pa.array(float64_vals, type=pa.float64()) + mask = pa.compute.is_null(arr) + nonnull = pa.compute.fill_null(arr, float("nan")) + pd_array = pd.arrays.FloatingArray( + nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy(zero_copy_only=False), + ) + float64_ser = pd.Series(pd_array, index=ids, dtype=pd.Float64Dtype()) + + # string_col + string_vals = [ + str(get_val(row, "string_col")) + if get_val(row, "string_col") is not None + else None + for row in raw_rows + ] + string_ser = pd.Series( + string_vals, index=ids, dtype=pd.StringDtype(storage="pyarrow") + ) - df = pd.read_json( - DATA_DIR / "nested_structs.jsonl", - lines=True, + # json_col + json_strs = [] + for row in raw_rows: + if "json_col" not in row: + json_strs.append(None) + elif row["json_col"] is None: + json_strs.append("null") + else: + json_strs.append( + json.dumps(row["json_col"], sort_keys=True, separators=(",", ":")) + ) + json_arr = pa.array(json_strs, type=db_dtypes.JSONArrowType()) + json_ser = pd.Series( + json_arr, index=ids, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()) + ) + + # date_col + date_vals = [ + datetime.date.fromisoformat(get_val(row, "date_col")) + if get_val(row, "date_col") is not None + else None + for row in raw_rows + ] + date_arr = pa.array(date_vals, type=pa.date32()) + date_ser = pd.Series(date_arr, index=ids, dtype=pd.ArrowDtype(pa.date32())) + + # time_col + time_vals = [ + datetime.time.fromisoformat(get_val(row, "time_col")) + if get_val(row, "time_col") is not None + else None + for row in raw_rows + ] + time_arr = pa.array(time_vals, type=pa.time64("us")) + time_ser = pd.Series(time_arr, index=ids, dtype=pd.ArrowDtype(pa.time64("us"))) + + # datetime_col + datetime_vals = [] + for row in raw_rows: + val = get_val(row, "datetime_col") + if val is None: + datetime_vals.append(None) + else: + datetime_vals.append(datetime.datetime.fromisoformat(val.replace(" ", "T"))) + datetime_arr = pa.array(datetime_vals, type=pa.timestamp("us")) + datetime_ser = pd.Series( + datetime_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us")) + ) + + # timestamp_col + timestamp_vals = [ + datetime.datetime.fromisoformat(get_val(row, "timestamp_col")) + if get_val(row, "timestamp_col") is not None + else None + for row in raw_rows + ] + timestamp_arr = pa.array(timestamp_vals, type=pa.timestamp("us", tz="UTC")) + timestamp_ser = pd.Series( + timestamp_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC")) + ) + + # bytes_col + bytes_vals = [] + for row in raw_rows: + val = get_val(row, "bytes_col") + if val is None: + bytes_vals.append(None) + elif val == "": + bytes_vals.append(b"") + else: + bytes_vals.append(base64.b64decode(val)) + bytes_arr = pa.array(bytes_vals, type=pa.binary()) + bytes_ser = pd.Series(bytes_arr, index=ids, dtype=pd.ArrowDtype(pa.binary())) + + # numeric_col + numeric_vals = [ + decimal.Decimal(str(get_val(row, "numeric_col"))) + if get_val(row, "numeric_col") is not None + else None + for row in raw_rows + ] + numeric_arr = pa.array(numeric_vals, type=pa.decimal128(38, 9)) + numeric_ser = pd.Series( + numeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal128(38, 9)) + ) + + # bignumeric_col + bignumeric_vals = [ + decimal.Decimal(str(get_val(row, "bignumeric_col"))) + if get_val(row, "bignumeric_col") is not None + else None + for row in raw_rows + ] + bignumeric_arr = pa.array(bignumeric_vals, type=pa.decimal256(76, 38)) + bignumeric_ser = pd.Series( + bignumeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal256(76, 38)) + ) + + # geography_col + geo_vals = [get_val(row, "geography_col") for row in raw_rows] + geo_ser = gpd.GeoSeries.from_wkt(geo_vals) + geo_ser.index = ids + + # duration_col + duration_vals = [ + int(get_val(row, "duration_col")) + if get_val(row, "duration_col") is not None + else None + for row in raw_rows + ] + duration_arr = pa.array(duration_vals, type=pa.duration("us")) + duration_ser = pd.Series( + duration_arr, index=ids, dtype=pd.ArrowDtype(pa.duration("us")) + ) + + df = pd.DataFrame( + { + "person": person_ser, + "bool_col": bool_ser, + "int64_col": int64_ser, + "float64_col": float64_ser, + "string_col": string_ser, + "json_col": json_ser, + "date_col": date_ser, + "time_col": time_ser, + "datetime_col": datetime_ser, + "timestamp_col": timestamp_ser, + "bytes_col": bytes_ser, + "numeric_col": numeric_ser, + "bignumeric_col": bignumeric_ser, + "geography_col": geo_ser, + "duration_col": duration_ser, + }, + index=ids, ) - df = df.set_index("id") - df["person"] = df["person"].astype(nested_structs_pandas_type) + df.index.name = "id" + return df diff --git a/packages/bigframes/tests/system/small/test_pandas.py b/packages/bigframes/tests/system/small/test_pandas.py index 7581557b0b33..356e498021b9 100644 --- a/packages/bigframes/tests/system/small/test_pandas.py +++ b/packages/bigframes/tests/system/small/test_pandas.py @@ -51,6 +51,84 @@ def test_concat_dataframe_w_struct_cols(nested_structs_df, nested_structs_pandas pd.testing.assert_frame_equal(bf_result, pd_result) +def test_nested_structs_dtypes_and_edge_cases(nested_structs_df): + """Explicitly verify dtypes and edge case values for all supported types.""" + import datetime as dt + import decimal + + import numpy as np + import pandas as pd + + import bigframes.dtypes as bfd + + # 1. Verify BigFrames dtypes + expected_bf_dtypes = { + "person": nested_structs_df["person"].dtype, + "bool_col": bfd.BOOL_DTYPE, + "int64_col": bfd.INT_DTYPE, + "float64_col": bfd.FLOAT_DTYPE, + "string_col": bfd.STRING_DTYPE, + "json_col": bfd.JSON_DTYPE, + "date_col": bfd.DATE_DTYPE, + "time_col": bfd.TIME_DTYPE, + "datetime_col": bfd.DATETIME_DTYPE, + "timestamp_col": bfd.TIMESTAMP_DTYPE, + "bytes_col": bfd.BYTES_DTYPE, + "numeric_col": bfd.NUMERIC_DTYPE, + "bignumeric_col": bfd.BIGNUMERIC_DTYPE, + "geography_col": bfd.GEO_DTYPE, + "duration_col": bfd.TIMEDELTA_DTYPE, + } + + for col_name, expected_dtype in expected_bf_dtypes.items(): + assert nested_structs_df[col_name].dtype == expected_dtype, ( + f"Dtype mismatch for {col_name}" + ) + + # 2. Convert to pandas for value assertions + pd_df = nested_structs_df.to_pandas() + + # Verify we have 6 rows + assert len(pd_df) == 6 + + # Row 1: Normal typical values + assert pd_df.loc[1, "bool_col"] == True + assert pd_df.loc[1, "int64_col"] == 123456789 + assert pd_df.loc[1, "float64_col"] == 1.25 + assert pd_df.loc[1, "string_col"] == "Hello World" + assert pd_df.loc[1, "json_col"] == '{"a":1,"b":[1,2]}' + assert pd_df.loc[1, "date_col"] == dt.date(2026, 6, 24) + + # Row 2: Min bounds / negative infinity + assert pd_df.loc[2, "int64_col"] == -9223372036854775808 + assert pd_df.loc[2, "float64_col"] == float("-inf") + assert pd_df.loc[2, "numeric_col"] == decimal.Decimal( + "-99999999999999999999999999999.999999999" + ) + + # Row 3: Max bounds / infinity + assert pd_df.loc[3, "int64_col"] == 9223372036854775807 + assert pd_df.loc[3, "float64_col"] == float("inf") + + # Row 4: SQL NULLs (omitted keys) + assert pd.isna(pd_df.loc[4, "bool_col"]) + assert pd.isna(pd_df.loc[4, "int64_col"]) + assert pd.isna(pd_df.loc[4, "float64_col"]) + assert pd.isna(pd_df.loc[4, "json_col"]) + assert pd.isna(pd_df.loc[4, "geography_col"]) + + # Row 5: Special edge cases (NaN, empty, multiline) + assert np.isnan(pd_df.loc[5, "float64_col"]) + assert pd_df.loc[5, "float64_col"] is not pd.NA + assert not pd_df["float64_col"].isna().loc[5] + assert pd_df.loc[5, "string_col"] == 'Line 1\nLine 2\n"Quotes"' + assert pd_df.loc[5, "bytes_col"] == b"\x00" + + # Row 6: JSON null literal + assert pd_df.loc[6, "json_col"] == "null" + assert not pd_df["json_col"].isna().loc[6] + + def test_concat_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.concat( From a5fa941d692a193271f585b538a4d88a25105cc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 26 Jun 2026 09:52:24 -0500 Subject: [PATCH 2/7] Update packages/bigframes/tests/system/conftest.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- packages/bigframes/tests/system/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/bigframes/tests/system/conftest.py b/packages/bigframes/tests/system/conftest.py index 230a39a29f87..a50f4fa7940b 100644 --- a/packages/bigframes/tests/system/conftest.py +++ b/packages/bigframes/tests/system/conftest.py @@ -625,7 +625,7 @@ def get_val(row, col_name): # timestamp_col timestamp_vals = [ - datetime.datetime.fromisoformat(get_val(row, "timestamp_col")) + datetime.datetime.fromisoformat(get_val(row, "timestamp_col").replace("Z", "+00:00")) if get_val(row, "timestamp_col") is not None else None for row in raw_rows From a6a241fdbc6cc2be393854d149b5e6de539c1ecb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Jun 2026 16:57:38 +0000 Subject: [PATCH 3/7] fix typing --- packages/bigframes/tests/system/conftest.py | 43 +++++++++------------ 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/packages/bigframes/tests/system/conftest.py b/packages/bigframes/tests/system/conftest.py index 230a39a29f87..51753022d608 100644 --- a/packages/bigframes/tests/system/conftest.py +++ b/packages/bigframes/tests/system/conftest.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import hashlib import logging import math @@ -19,7 +20,6 @@ import textwrap import traceback import typing -from datetime import datetime from typing import Dict, Generator, Optional import fsspec # type: ignore[import-untyped] @@ -501,12 +501,11 @@ def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.Da Manually parses using json.loads to preserve data types. """ import base64 - import datetime import decimal import json - import db_dtypes - import geopandas as gpd + import db_dtypes # type: ignore[import-untyped] + import geopandas as gpd # type: ignore[import-untyped] with open(DATA_DIR / "nested_structs.jsonl") as f: raw_rows = [json.loads(line) for line in f] @@ -518,7 +517,7 @@ def get_val(row, col_name): # person person_struct_schema = nested_structs_pandas_type.pyarrow_dtype - processed_person = [] + processed_person: list[Optional[dict[str, typing.Any]]] = [] for row in raw_rows: x = get_val(row, "person") if x is None: @@ -555,13 +554,7 @@ def get_val(row, col_name): for row in raw_rows ] arr = pa.array(float64_vals, type=pa.float64()) - mask = pa.compute.is_null(arr) - nonnull = pa.compute.fill_null(arr, float("nan")) - pd_array = pd.arrays.FloatingArray( - nonnull.to_numpy(zero_copy_only=False), - mask.to_numpy(zero_copy_only=False), - ) - float64_ser = pd.Series(pd_array, index=ids, dtype=pd.Float64Dtype()) + float64_ser = pd.Series(arr, index=ids, dtype=pd.Float64Dtype()) # string_col string_vals = [ @@ -575,7 +568,7 @@ def get_val(row, col_name): ) # json_col - json_strs = [] + json_strs: list[Optional[str]] = [] for row in raw_rows: if "json_col" not in row: json_strs.append(None) @@ -611,7 +604,7 @@ def get_val(row, col_name): time_ser = pd.Series(time_arr, index=ids, dtype=pd.ArrowDtype(pa.time64("us"))) # datetime_col - datetime_vals = [] + datetime_vals: list[Optional[datetime.datetime]] = [] for row in raw_rows: val = get_val(row, "datetime_col") if val is None: @@ -636,7 +629,7 @@ def get_val(row, col_name): ) # bytes_col - bytes_vals = [] + bytes_vals: list[Optional[bytes]] = [] for row in raw_rows: val = get_val(row, "bytes_col") if val is None: @@ -1041,9 +1034,9 @@ def new_time_series_pandas_df(): return pd.DataFrame( { "parsed_date": [ - datetime(2017, 8, 2, tzinfo=utc), - datetime(2017, 8, 3, tzinfo=utc), - datetime(2017, 8, 4, tzinfo=utc), + datetime.datetime(2017, 8, 2, tzinfo=utc), + datetime.datetime(2017, 8, 3, tzinfo=utc), + datetime.datetime(2017, 8, 4, tzinfo=utc), ], "total_visits": [2500, 2500, 2500], } @@ -1062,12 +1055,12 @@ def new_time_series_pandas_df_w_id(): return pd.DataFrame( { "parsed_date": [ - datetime(2017, 8, 2, tzinfo=utc), - datetime(2017, 8, 2, tzinfo=utc), - datetime(2017, 8, 3, tzinfo=utc), - datetime(2017, 8, 3, tzinfo=utc), - datetime(2017, 8, 4, tzinfo=utc), - datetime(2017, 8, 4, tzinfo=utc), + datetime.datetime(2017, 8, 2, tzinfo=utc), + datetime.datetime(2017, 8, 2, tzinfo=utc), + datetime.datetime(2017, 8, 3, tzinfo=utc), + datetime.datetime(2017, 8, 3, tzinfo=utc), + datetime.datetime(2017, 8, 4, tzinfo=utc), + datetime.datetime(2017, 8, 4, tzinfo=utc), ], "id": ["1", "2", "1", "2", "1", "2"], "total_visits": [2500, 2500, 2500, 2500, 2500, 2500], @@ -1680,7 +1673,7 @@ def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent continue # Ignore the functions less than one day old - age = datetime.now() - datetime.fromtimestamp( + age = datetime.datetime.now() - datetime.datetime.fromtimestamp( cloud_function.update_time.timestamp() ) if age.days <= 0: From 4000b9f86408101cd9713f82ce171cac3f278ed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Jun 2026 16:58:52 +0000 Subject: [PATCH 4/7] fix lint --- packages/bigframes/tests/system/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/bigframes/tests/system/conftest.py b/packages/bigframes/tests/system/conftest.py index 375864ba41be..37d56047513e 100644 --- a/packages/bigframes/tests/system/conftest.py +++ b/packages/bigframes/tests/system/conftest.py @@ -618,7 +618,9 @@ def get_val(row, col_name): # timestamp_col timestamp_vals = [ - datetime.datetime.fromisoformat(get_val(row, "timestamp_col").replace("Z", "+00:00")) + datetime.datetime.fromisoformat( + get_val(row, "timestamp_col").replace("Z", "+00:00") + ) if get_val(row, "timestamp_col") is not None else None for row in raw_rows From 9393164c041329f1f5c3a5d06dbba5b28c391d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Jun 2026 18:47:42 +0000 Subject: [PATCH 5/7] fix unit tests --- packages/bigframes/tests/unit/conftest.py | 11 ++ .../out.sql | 130 +++++++++++++++++- 2 files changed, 135 insertions(+), 6 deletions(-) diff --git a/packages/bigframes/tests/unit/conftest.py b/packages/bigframes/tests/unit/conftest.py index 3ab217cf09ba..34a580c9374d 100644 --- a/packages/bigframes/tests/unit/conftest.py +++ b/packages/bigframes/tests/unit/conftest.py @@ -178,6 +178,9 @@ def nested_structs_types_df(compiler_session_w_nested_structs_types) -> bpd.Data def nested_structs_pandas_df() -> pd.DataFrame: """Returns a pandas DataFrame containing STRUCT types and using the `id` column as the index.""" + import json + + import numpy as np df = pd.read_json( DATA_DIR / "nested_structs.jsonl", @@ -196,6 +199,14 @@ def nested_structs_pandas_df() -> pd.DataFrame: ] ) df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema)) + + def to_json_str(val): + if val is None or (isinstance(val, float) and np.isnan(val)): + return None + return json.dumps(val) + + df["json_col"] = df["json_col"].apply(to_json_str).astype(dtypes.JSON_DTYPE) + return df diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql index 7ded9cf5fff7..46f362a708b9 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -1,27 +1,145 @@ WITH `bfcte_0` AS ( SELECT * - FROM UNNEST(ARRAY>, `bfcol_2` INT64>>[STRUCT( + FROM UNNEST(ARRAY>, `bfcol_2` FLOAT64, `bfcol_3` FLOAT64, `bfcol_4` FLOAT64, `bfcol_5` STRING, `bfcol_6` JSON, `bfcol_7` STRING, `bfcol_8` STRING, `bfcol_9` STRING, `bfcol_10` TIMESTAMP, `bfcol_11` STRING, `bfcol_12` FLOAT64, `bfcol_13` FLOAT64, `bfcol_14` STRING, `bfcol_15` FLOAT64, `bfcol_16` INT64>>[STRUCT( 1, STRUCT( 'Alice' AS `name`, 30 AS `age`, STRUCT('New York' AS `city`, 'USA' AS `country`) AS `address` ), + 1.0, + 123456789.0, + 1.25, + 'Hello World', + PARSE_JSON('{"a":1,"b":[1,2]}'), + '2026-06-24', + '12:34:56.789012', + '2026-06-24 12:34:56.789012', + CAST('2026-06-24T12:34:56.789012+00:00' AS TIMESTAMP), + 'SGVsbG8=', + 123456.789, + 123456.78901234567, + 'POINT(30 10)', + 1000.0, 0 ), STRUCT( 2, + STRUCT('' AS `name`, -1 AS `age`, STRUCT('' AS `city`, '' AS `country`) AS `address`), + 0.0, + -9.223372036854776e+18, + CAST('-Infinity' AS FLOAT64), + '', + PARSE_JSON('{}'), + '0001-01-01', + '00:00:00', + '0001-01-02 00:00:00', + CAST('0001-01-02T00:00:00+00:00' AS TIMESTAMP), + '', + -1e+29, + -1e+38, + 'POINT(0 0)', + -9223372036854776.0, + 1 + ), STRUCT( + 3, + STRUCT( + 'Very Long Name...' AS `name`, + 150 AS `age`, + STRUCT('City' AS `city`, 'Country' AS `country`) AS `address` + ), + 1.0, + 9.223372036854776e+18, + CAST('Infinity' AS FLOAT64), + 'Unicode: 🚀 Spark ✨', + PARSE_JSON('{"max":true,"nested":{"val":999}}'), + '9999-12-31', + '23:59:59.999999', + '9999-12-31 23:59:59.999999', + CAST('9999-12-31T23:59:59.999999+00:00' AS TIMESTAMP), + 'dmVyeSBsb25nIGJ5dGVzIHZhbHVl', + 1e+29, + 1e+38, + 'POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))', + 9223372036854776.0, + 2 + ), STRUCT( + 4, + CAST(NULL AS STRUCT>), + CAST(NULL AS FLOAT64), + CAST(NULL AS FLOAT64), + CAST(NULL AS FLOAT64), + CAST(NULL AS STRING), + CAST(NULL AS JSON), + CAST(NULL AS STRING), + CAST(NULL AS STRING), + CAST(NULL AS STRING), + CAST(NULL AS TIMESTAMP), + CAST(NULL AS STRING), + CAST(NULL AS FLOAT64), + CAST(NULL AS FLOAT64), + CAST(NULL AS STRING), + CAST(NULL AS FLOAT64), + 3 + ), STRUCT( + 5, STRUCT( 'Bob' AS `name`, - 25 AS `age`, - STRUCT('London' AS `city`, 'UK' AS `country`) AS `address` + 0 AS `age`, + CAST(NULL AS STRUCT) AS `address` ), - 1 + 0.0, + 0.0, + CAST(NULL AS FLOAT64), + 'Line 1\nLine 2\n"Quotes"', + PARSE_JSON('[1,"two",null]'), + '1970-01-01', + '12:00:00', + '1970-01-01 12:00:00', + CAST('1970-01-01T12:00:00+00:00' AS TIMESTAMP), + 'AA==', + 0.0, + 0.0, + 'LINESTRING(0 0, 1 1, 2 2)', + 0.0, + 4 + ), STRUCT( + 6, + CAST(NULL AS STRUCT>), + CAST(NULL AS FLOAT64), + CAST(NULL AS FLOAT64), + CAST(NULL AS FLOAT64), + CAST(NULL AS STRING), + CAST(NULL AS JSON), + CAST(NULL AS STRING), + CAST(NULL AS STRING), + CAST(NULL AS STRING), + CAST(NULL AS TIMESTAMP), + CAST(NULL AS STRING), + CAST(NULL AS FLOAT64), + CAST(NULL AS FLOAT64), + CAST(NULL AS STRING), + CAST(NULL AS FLOAT64), + 5 )]) ) SELECT `bfcol_0` AS `id`, - `bfcol_1` AS `person` + `bfcol_1` AS `person`, + `bfcol_2` AS `bool_col`, + `bfcol_3` AS `int64_col`, + `bfcol_4` AS `float64_col`, + `bfcol_5` AS `string_col`, + `bfcol_6` AS `json_col`, + `bfcol_7` AS `date_col`, + `bfcol_8` AS `time_col`, + `bfcol_9` AS `datetime_col`, + `bfcol_10` AS `timestamp_col`, + `bfcol_11` AS `bytes_col`, + `bfcol_12` AS `numeric_col`, + `bfcol_13` AS `bignumeric_col`, + `bfcol_14` AS `geography_col`, + `bfcol_15` AS `duration_col` FROM `bfcte_0` ORDER BY - `bfcol_2` ASC NULLS LAST \ No newline at end of file + `bfcol_16` ASC NULLS LAST From 89cadb624ff536e9bac9bc9627fb2f9e9af5cf8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Jun 2026 18:52:35 +0000 Subject: [PATCH 6/7] fix snapshot --- .../test_compile_readlocal_w_structs_df/out.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql index 46f362a708b9..58a01635b7d2 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql @@ -142,4 +142,4 @@ SELECT `bfcol_15` AS `duration_col` FROM `bfcte_0` ORDER BY - `bfcol_16` ASC NULLS LAST + `bfcol_16` ASC NULLS LAST \ No newline at end of file From 38f4a5daf858efd66ce9dd44cb6f6edf24427305 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Jun 2026 19:58:51 +0000 Subject: [PATCH 7/7] fix unit tests --- packages/bigframes/tests/system/conftest.py | 9 +++++++-- packages/bigframes/tests/unit/conftest.py | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/packages/bigframes/tests/system/conftest.py b/packages/bigframes/tests/system/conftest.py index 37d56047513e..2f68471d176b 100644 --- a/packages/bigframes/tests/system/conftest.py +++ b/packages/bigframes/tests/system/conftest.py @@ -34,6 +34,7 @@ import google.cloud.storage as storage # type: ignore import numpy as np import pandas as pd +import pandas.arrays import pyarrow as pa import pytest import pytz @@ -553,8 +554,12 @@ def get_val(row, col_name): else None for row in raw_rows ] - arr = pa.array(float64_vals, type=pa.float64()) - float64_ser = pd.Series(arr, index=ids, dtype=pd.Float64Dtype()) + np_vals = np.array( + [x if x is not None else np.nan for x in float64_vals], dtype=np.float64 + ) + mask = np.array([x is None for x in float64_vals], dtype=bool) + float64_arr = pd.arrays.FloatingArray(np_vals, mask) # type: ignore + float64_ser = pd.Series(float64_arr, index=ids) # string_col string_vals = [ diff --git a/packages/bigframes/tests/unit/conftest.py b/packages/bigframes/tests/unit/conftest.py index 34a580c9374d..5a266bcd413a 100644 --- a/packages/bigframes/tests/unit/conftest.py +++ b/packages/bigframes/tests/unit/conftest.py @@ -207,6 +207,24 @@ def to_json_str(val): df["json_col"] = df["json_col"].apply(to_json_str).astype(dtypes.JSON_DTYPE) + # timestamp_col + import datetime + + def parse_timestamp(val): + if pd.isna(val): + return None + if isinstance(val, str): + return datetime.datetime.fromisoformat(val.replace("Z", "+00:00")) + if hasattr(val, "to_pydatetime"): + return val.to_pydatetime() + return val + + timestamp_vals = [parse_timestamp(x) for x in df["timestamp_col"]] + timestamp_arr = pa.array(timestamp_vals, type=dtypes.TIMESTAMP_DTYPE.pyarrow_dtype) + df["timestamp_col"] = pd.Series( + timestamp_arr, index=df.index, dtype=dtypes.TIMESTAMP_DTYPE + ) + return df