Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions packages/bigframes/tests/data/nested_structs.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
{"id": 1, "person": {"name": "Alice", "age":30, "address": {"city": "New York", "country": "USA"}}}
{"id": 2, "person": {"name": "Bob", "age":25, "address": {"city": "London", "country": "UK"}}}
{"id": 1, "person": {"name": "Alice", "age": 30, "address": {"city": "New York", "country": "USA"}}, "bool_col": true, "int64_col": "123456789", "float64_col": 1.25, "string_col": "Hello World", "json_col": {"a": 1, "b": [1, 2]}, "date_col": "2026-06-24", "time_col": "12:34:56.789012", "datetime_col": "2026-06-24 12:34:56.789012", "timestamp_col": "2026-06-24T12:34:56.789012Z", "bytes_col": "SGVsbG8=", "numeric_col": "123456.789", "bignumeric_col": "123456.7890123456789", "geography_col": "POINT(30 10)", "duration_col": "1000"}
{"id": 2, "person": {"name": "", "age": -1, "address": {"city": "", "country": ""}}, "bool_col": false, "int64_col": "-9223372036854775808", "float64_col": "-Infinity", "string_col": "", "json_col": {}, "date_col": "0001-01-01", "time_col": "00:00:00", "datetime_col": "0001-01-02 00:00:00", "timestamp_col": "0001-01-02T00:00:00Z", "bytes_col": "", "numeric_col": "-99999999999999999999999999999.999999999", "bignumeric_col": "-99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POINT(0 0)", "duration_col": "-9223372036854775"}
{"id": 3, "person": {"name": "Very Long Name...", "age": 150, "address": {"city": "City", "country": "Country"}}, "bool_col": true, "int64_col": "9223372036854775807", "float64_col": "Infinity", "string_col": "Unicode: 🚀 Spark ✨", "json_col": {"max": true, "nested": {"val": 999}}, "date_col": "9999-12-31", "time_col": "23:59:59.999999", "datetime_col": "9999-12-31 23:59:59.999999", "timestamp_col": "9999-12-31T23:59:59.999999Z", "bytes_col": "dmVyeSBsb25nIGJ5dGVzIHZhbHVl", "numeric_col": "99999999999999999999999999999.999999999", "bignumeric_col": "99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))", "duration_col": "9223372036854775"}
{"id": 4, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null}
{"id": 5, "person": {"name": "Bob", "age": 0, "address": null}, "bool_col": false, "int64_col": "0", "float64_col": "NaN", "string_col": "Line 1\nLine 2\n\"Quotes\"", "json_col": [1, "two", null], "date_col": "1970-01-01", "time_col": "12:00:00", "datetime_col": "1970-01-01 12:00:00", "timestamp_col": "1970-01-01T12:00:00Z", "bytes_col": "AA==", "numeric_col": "0", "bignumeric_col": "0", "geography_col": "LINESTRING(0 0, 1 1, 2 2)", "duration_col": "0"}
{"id": 6, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "json_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null}
73 changes: 73 additions & 0 deletions packages/bigframes/tests/data/nested_structs_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
{
"name": "person",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "name",
Expand All @@ -21,6 +22,7 @@
{
"name": "address",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "city",
Expand All @@ -35,5 +37,76 @@
]
}
]
},
{
"name": "bool_col",
"type": "BOOLEAN",
"mode": "NULLABLE"
},
{
"name": "int64_col",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "float64_col",
"type": "FLOAT",
"mode": "NULLABLE"
},
{
"name": "string_col",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "json_col",
"type": "JSON",
"mode": "NULLABLE"
},
{
"name": "date_col",
"type": "DATE",
"mode": "NULLABLE"
},
{
"name": "time_col",
"type": "TIME",
"mode": "NULLABLE"
},
{
"name": "datetime_col",
"type": "DATETIME",
"mode": "NULLABLE"
},
{
"name": "timestamp_col",
"type": "TIMESTAMP",
"mode": "NULLABLE"
},
{
"name": "bytes_col",
"type": "BYTES",
"mode": "NULLABLE"
},
{
"name": "numeric_col",
"type": "NUMERIC",
"mode": "NULLABLE"
},
{
"name": "bignumeric_col",
"type": "BIGNUMERIC",
"mode": "NULLABLE"
},
{
"name": "geography_col",
"type": "GEOGRAPHY",
"mode": "NULLABLE"
},
{
"name": "duration_col",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "#microseconds"
}
]
219 changes: 213 additions & 6 deletions packages/bigframes/tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,14 +496,221 @@ def nested_structs_df(

@pytest.fixture(scope="session")
def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.DataFrame:
"""pd.DataFrame pointing at test data."""
"""pd.DataFrame pointing at test data.

Manually parses using json.loads to preserve data types.
"""
import base64
import datetime
import decimal
import json

import db_dtypes
import geopandas as gpd

with open(DATA_DIR / "nested_structs.jsonl") as f:
raw_rows = [json.loads(line) for line in f]

ids = [row["id"] for row in raw_rows]

def get_val(row, col_name):
return row.get(col_name)

# person
person_struct_schema = nested_structs_pandas_type.pyarrow_dtype
processed_person = []
for row in raw_rows:
x = get_val(row, "person")
if x is None:
processed_person.append(None)
else:
d = dict(x)
if "age" in d and d["age"] is not None:
d["age"] = int(d["age"])
processed_person.append(d)
person_arr = pa.array(processed_person, type=person_struct_schema)
person_ser = pd.Series(person_arr, index=ids, dtype=nested_structs_pandas_type)

# bool_col
bool_vals = [
bool(get_val(row, "bool_col")) if get_val(row, "bool_col") is not None else None
for row in raw_rows
]
bool_ser = pd.Series(bool_vals, index=ids, dtype=pd.BooleanDtype())

# int64_col
int64_vals = [
int(get_val(row, "int64_col"))
if get_val(row, "int64_col") is not None
else None
for row in raw_rows
]
int64_ser = pd.Series(int64_vals, index=ids, dtype=pd.Int64Dtype())

# float64_col
float64_vals = [
float(get_val(row, "float64_col"))
if get_val(row, "float64_col") is not None
else None
for row in raw_rows
]
arr = pa.array(float64_vals, type=pa.float64())
mask = pa.compute.is_null(arr)
nonnull = pa.compute.fill_null(arr, float("nan"))
pd_array = pd.arrays.FloatingArray(
nonnull.to_numpy(zero_copy_only=False),
mask.to_numpy(zero_copy_only=False),
)
float64_ser = pd.Series(pd_array, index=ids, dtype=pd.Float64Dtype())

# string_col
string_vals = [
str(get_val(row, "string_col"))
if get_val(row, "string_col") is not None
else None
for row in raw_rows
]
string_ser = pd.Series(
string_vals, index=ids, dtype=pd.StringDtype(storage="pyarrow")
)

df = pd.read_json(
DATA_DIR / "nested_structs.jsonl",
lines=True,
# json_col
json_strs = []
for row in raw_rows:
if "json_col" not in row:
json_strs.append(None)
elif row["json_col"] is None:
json_strs.append("null")
else:
json_strs.append(
json.dumps(row["json_col"], sort_keys=True, separators=(",", ":"))
)
json_arr = pa.array(json_strs, type=db_dtypes.JSONArrowType())
json_ser = pd.Series(
json_arr, index=ids, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType())
)

# date_col
date_vals = [
datetime.date.fromisoformat(get_val(row, "date_col"))
if get_val(row, "date_col") is not None
else None
for row in raw_rows
]
date_arr = pa.array(date_vals, type=pa.date32())
date_ser = pd.Series(date_arr, index=ids, dtype=pd.ArrowDtype(pa.date32()))

# time_col
time_vals = [
datetime.time.fromisoformat(get_val(row, "time_col"))
if get_val(row, "time_col") is not None
else None
for row in raw_rows
]
time_arr = pa.array(time_vals, type=pa.time64("us"))
time_ser = pd.Series(time_arr, index=ids, dtype=pd.ArrowDtype(pa.time64("us")))

# datetime_col
datetime_vals = []
for row in raw_rows:
val = get_val(row, "datetime_col")
if val is None:
datetime_vals.append(None)
else:
datetime_vals.append(datetime.datetime.fromisoformat(val.replace(" ", "T")))
datetime_arr = pa.array(datetime_vals, type=pa.timestamp("us"))
datetime_ser = pd.Series(
datetime_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us"))
)

# timestamp_col
timestamp_vals = [
datetime.datetime.fromisoformat(get_val(row, "timestamp_col"))
if get_val(row, "timestamp_col") is not None
else None
for row in raw_rows
]
Comment on lines +627 to +632

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

In Python 3.10 and below, datetime.datetime.fromisoformat does not support the trailing Z suffix (which was introduced in Python 3.11). Since this library supports Python 3.9 and 3.10, parsing timestamp_col values ending in Z will raise a ValueError. Replacing Z with +00:00 ensures backward compatibility across all supported Python versions.

Suggested change
timestamp_vals = [
datetime.datetime.fromisoformat(get_val(row, "timestamp_col"))
if get_val(row, "timestamp_col") is not None
else None
for row in raw_rows
]
timestamp_vals = [
datetime.datetime.fromisoformat(get_val(row, "timestamp_col").replace("Z", "+00:00"))
if get_val(row, "timestamp_col") is not None
else None
for row in raw_rows
]

timestamp_arr = pa.array(timestamp_vals, type=pa.timestamp("us", tz="UTC"))
timestamp_ser = pd.Series(
timestamp_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
)

# bytes_col
bytes_vals = []
for row in raw_rows:
val = get_val(row, "bytes_col")
if val is None:
bytes_vals.append(None)
elif val == "":
bytes_vals.append(b"")
else:
bytes_vals.append(base64.b64decode(val))
bytes_arr = pa.array(bytes_vals, type=pa.binary())
bytes_ser = pd.Series(bytes_arr, index=ids, dtype=pd.ArrowDtype(pa.binary()))

# numeric_col
numeric_vals = [
decimal.Decimal(str(get_val(row, "numeric_col")))
if get_val(row, "numeric_col") is not None
else None
for row in raw_rows
]
numeric_arr = pa.array(numeric_vals, type=pa.decimal128(38, 9))
numeric_ser = pd.Series(
numeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal128(38, 9))
)

# bignumeric_col
bignumeric_vals = [
decimal.Decimal(str(get_val(row, "bignumeric_col")))
if get_val(row, "bignumeric_col") is not None
else None
for row in raw_rows
]
bignumeric_arr = pa.array(bignumeric_vals, type=pa.decimal256(76, 38))
bignumeric_ser = pd.Series(
bignumeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal256(76, 38))
)

# geography_col
geo_vals = [get_val(row, "geography_col") for row in raw_rows]
geo_ser = gpd.GeoSeries.from_wkt(geo_vals)
geo_ser.index = ids

# duration_col
duration_vals = [
int(get_val(row, "duration_col"))
if get_val(row, "duration_col") is not None
else None
for row in raw_rows
]
duration_arr = pa.array(duration_vals, type=pa.duration("us"))
duration_ser = pd.Series(
duration_arr, index=ids, dtype=pd.ArrowDtype(pa.duration("us"))
)

df = pd.DataFrame(
{
"person": person_ser,
"bool_col": bool_ser,
"int64_col": int64_ser,
"float64_col": float64_ser,
"string_col": string_ser,
"json_col": json_ser,
"date_col": date_ser,
"time_col": time_ser,
"datetime_col": datetime_ser,
"timestamp_col": timestamp_ser,
"bytes_col": bytes_ser,
"numeric_col": numeric_ser,
"bignumeric_col": bignumeric_ser,
"geography_col": geo_ser,
"duration_col": duration_ser,
},
index=ids,
)
df = df.set_index("id")
df["person"] = df["person"].astype(nested_structs_pandas_type)
df.index.name = "id"

return df


Expand Down
Loading
Loading