googleapis · tswast · Jun 24, 2026 · gemini-code-assist · Jun 24, 2026
@@ -1,2 +1,6 @@
-{"id": 1,   "person": {"name": "Alice", "age":30,   "address": {"city": "New York", "country": "USA"}}}
-{"id": 2,   "person": {"name": "Bob",   "age":25,   "address": {"city": "London",   "country": "UK"}}}
+{"id": 1, "person": {"name": "Alice", "age": 30, "address": {"city": "New York", "country": "USA"}}, "bool_col": true, "int64_col": "123456789", "float64_col": 1.25, "string_col": "Hello World", "json_col": {"a": 1, "b": [1, 2]}, "date_col": "2026-06-24", "time_col": "12:34:56.789012", "datetime_col": "2026-06-24 12:34:56.789012", "timestamp_col": "2026-06-24T12:34:56.789012Z", "bytes_col": "SGVsbG8=", "numeric_col": "123456.789", "bignumeric_col": "123456.7890123456789", "geography_col": "POINT(30 10)", "duration_col": "1000"}
+{"id": 2, "person": {"name": "", "age": -1, "address": {"city": "", "country": ""}}, "bool_col": false, "int64_col": "-9223372036854775808", "float64_col": "-Infinity", "string_col": "", "json_col": {}, "date_col": "0001-01-01", "time_col": "00:00:00", "datetime_col": "0001-01-02 00:00:00", "timestamp_col": "0001-01-02T00:00:00Z", "bytes_col": "", "numeric_col": "-99999999999999999999999999999.999999999", "bignumeric_col": "-99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POINT(0 0)", "duration_col": "-9223372036854775"}
+{"id": 3, "person": {"name": "Very Long Name...", "age": 150, "address": {"city": "City", "country": "Country"}}, "bool_col": true, "int64_col": "9223372036854775807", "float64_col": "Infinity", "string_col": "Unicode: 🚀 Spark ✨", "json_col": {"max": true, "nested": {"val": 999}}, "date_col": "9999-12-31", "time_col": "23:59:59.999999", "datetime_col": "9999-12-31 23:59:59.999999", "timestamp_col": "9999-12-31T23:59:59.999999Z", "bytes_col": "dmVyeSBsb25nIGJ5dGVzIHZhbHVl", "numeric_col": "99999999999999999999999999999.999999999", "bignumeric_col": "99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))", "duration_col": "9223372036854775"}
+{"id": 4, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null}
+{"id": 5, "person": {"name": "Bob", "age": 0, "address": null}, "bool_col": false, "int64_col": "0", "float64_col": "NaN", "string_col": "Line 1\nLine 2\n\"Quotes\"", "json_col": [1, "two", null], "date_col": "1970-01-01", "time_col": "12:00:00", "datetime_col": "1970-01-01 12:00:00", "timestamp_col": "1970-01-01T12:00:00Z", "bytes_col": "AA==", "numeric_col": "0", "bignumeric_col": "0", "geography_col": "LINESTRING(0 0, 1 1, 2 2)", "duration_col": "0"}
+{"id": 6, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "json_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null}
@@ -7,6 +7,7 @@
     {
         "name": "person",
         "type": "RECORD",
+        "mode": "NULLABLE",
         "fields": [
             {
                 "name": "name",
@@ -21,6 +22,7 @@
             {
                 "name": "address",
                 "type": "RECORD",
+                "mode": "NULLABLE",
                 "fields": [
                     {
                         "name": "city",
@@ -35,5 +37,76 @@
                 ]
             }
         ]
+    },
+    {
+        "name": "bool_col",
+        "type": "BOOLEAN",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "int64_col",
+        "type": "INTEGER",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "float64_col",
+        "type": "FLOAT",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "string_col",
+        "type": "STRING",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "json_col",
+        "type": "JSON",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "date_col",
+        "type": "DATE",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "time_col",
+        "type": "TIME",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "datetime_col",
+        "type": "DATETIME",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "timestamp_col",
+        "type": "TIMESTAMP",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "bytes_col",
+        "type": "BYTES",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "numeric_col",
+        "type": "NUMERIC",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "bignumeric_col",
+        "type": "BIGNUMERIC",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "geography_col",
+        "type": "GEOGRAPHY",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "duration_col",
+        "type": "INTEGER",
+        "mode": "NULLABLE",
+        "description": "#microseconds"
     }
 ]
@@ -496,14 +496,221 @@ def nested_structs_df(
 
 @pytest.fixture(scope="session")
 def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.DataFrame:
-    """pd.DataFrame pointing at test data."""
+    """pd.DataFrame pointing at test data.
+
+    Manually parses using json.loads to preserve data types.
+    """
+    import base64
+    import datetime
+    import decimal
+    import json
+
+    import db_dtypes
+    import geopandas as gpd
+
+    with open(DATA_DIR / "nested_structs.jsonl") as f:
+        raw_rows = [json.loads(line) for line in f]
+
+    ids = [row["id"] for row in raw_rows]
+
+    def get_val(row, col_name):
+        return row.get(col_name)
+
+    # person
+    person_struct_schema = nested_structs_pandas_type.pyarrow_dtype
+    processed_person = []
+    for row in raw_rows:
+        x = get_val(row, "person")
+        if x is None:
+            processed_person.append(None)
+        else:
+            d = dict(x)
+            if "age" in d and d["age"] is not None:
+                d["age"] = int(d["age"])
+            processed_person.append(d)
+    person_arr = pa.array(processed_person, type=person_struct_schema)
+    person_ser = pd.Series(person_arr, index=ids, dtype=nested_structs_pandas_type)
+
+    # bool_col
+    bool_vals = [
+        bool(get_val(row, "bool_col")) if get_val(row, "bool_col") is not None else None
+        for row in raw_rows
+    ]
+    bool_ser = pd.Series(bool_vals, index=ids, dtype=pd.BooleanDtype())
+
+    # int64_col
+    int64_vals = [
+        int(get_val(row, "int64_col"))
+        if get_val(row, "int64_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    int64_ser = pd.Series(int64_vals, index=ids, dtype=pd.Int64Dtype())
+
+    # float64_col
+    float64_vals = [
+        float(get_val(row, "float64_col"))
+        if get_val(row, "float64_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    arr = pa.array(float64_vals, type=pa.float64())
+    mask = pa.compute.is_null(arr)
+    nonnull = pa.compute.fill_null(arr, float("nan"))
+    pd_array = pd.arrays.FloatingArray(
+        nonnull.to_numpy(zero_copy_only=False),
+        mask.to_numpy(zero_copy_only=False),
+    )
+    float64_ser = pd.Series(pd_array, index=ids, dtype=pd.Float64Dtype())
+
+    # string_col
+    string_vals = [
+        str(get_val(row, "string_col"))
+        if get_val(row, "string_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    string_ser = pd.Series(
+        string_vals, index=ids, dtype=pd.StringDtype(storage="pyarrow")
+    )
 
-    df = pd.read_json(
-        DATA_DIR / "nested_structs.jsonl",
-        lines=True,
+    # json_col
+    json_strs = []
+    for row in raw_rows:
+        if "json_col" not in row:
+            json_strs.append(None)
+        elif row["json_col"] is None:
+            json_strs.append("null")
+        else:
+            json_strs.append(
+                json.dumps(row["json_col"], sort_keys=True, separators=(",", ":"))
+            )
+    json_arr = pa.array(json_strs, type=db_dtypes.JSONArrowType())
+    json_ser = pd.Series(
+        json_arr, index=ids, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType())
+    )
+
+    # date_col
+    date_vals = [
+        datetime.date.fromisoformat(get_val(row, "date_col"))
+        if get_val(row, "date_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    date_arr = pa.array(date_vals, type=pa.date32())
+    date_ser = pd.Series(date_arr, index=ids, dtype=pd.ArrowDtype(pa.date32()))
+
+    # time_col
+    time_vals = [
+        datetime.time.fromisoformat(get_val(row, "time_col"))
+        if get_val(row, "time_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    time_arr = pa.array(time_vals, type=pa.time64("us"))
+    time_ser = pd.Series(time_arr, index=ids, dtype=pd.ArrowDtype(pa.time64("us")))
+
+    # datetime_col
+    datetime_vals = []
+    for row in raw_rows:
+        val = get_val(row, "datetime_col")
+        if val is None:
+            datetime_vals.append(None)
+        else:
+            datetime_vals.append(datetime.datetime.fromisoformat(val.replace(" ", "T")))
+    datetime_arr = pa.array(datetime_vals, type=pa.timestamp("us"))
+    datetime_ser = pd.Series(
+        datetime_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us"))
+    )
+
+    # timestamp_col
+    timestamp_vals = [
+        datetime.datetime.fromisoformat(get_val(row, "timestamp_col"))
+        if get_val(row, "timestamp_col") is not None
+        else None
+        for row in raw_rows
+    ]
-    timestamp_vals = [
-        datetime.datetime.fromisoformat(get_val(row, "timestamp_col"))
-        if get_val(row, "timestamp_col") is not None
-        else None
-        for row in raw_rows
-    ]
+    timestamp_vals = [
+        datetime.datetime.fromisoformat(get_val(row, "timestamp_col").replace("Z", "+00:00"))
+        if get_val(row, "timestamp_col") is not None
+        else None
+        for row in raw_rows
+    ]
-    timestamp_vals = [
-        datetime.datetime.fromisoformat(get_val(row, "timestamp_col"))
-        if get_val(row, "timestamp_col") is not None
-        else None
-        for row in raw_rows
-    ]
+    timestamp_vals = [
+        datetime.datetime.fromisoformat(get_val(row, "timestamp_col").replace("Z", "+00:00"))
+        if get_val(row, "timestamp_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    timestamp_arr = pa.array(timestamp_vals, type=pa.timestamp("us", tz="UTC"))
+    timestamp_ser = pd.Series(
+        timestamp_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
+    )
+
+    # bytes_col
+    bytes_vals = []
+    for row in raw_rows:
+        val = get_val(row, "bytes_col")
+        if val is None:
+            bytes_vals.append(None)
+        elif val == "":
+            bytes_vals.append(b"")
+        else:
+            bytes_vals.append(base64.b64decode(val))
+    bytes_arr = pa.array(bytes_vals, type=pa.binary())
+    bytes_ser = pd.Series(bytes_arr, index=ids, dtype=pd.ArrowDtype(pa.binary()))
+
+    # numeric_col
+    numeric_vals = [
+        decimal.Decimal(str(get_val(row, "numeric_col")))
+        if get_val(row, "numeric_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    numeric_arr = pa.array(numeric_vals, type=pa.decimal128(38, 9))
+    numeric_ser = pd.Series(
+        numeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal128(38, 9))
+    )
+
+    # bignumeric_col
+    bignumeric_vals = [
+        decimal.Decimal(str(get_val(row, "bignumeric_col")))
+        if get_val(row, "bignumeric_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    bignumeric_arr = pa.array(bignumeric_vals, type=pa.decimal256(76, 38))
+    bignumeric_ser = pd.Series(
+        bignumeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal256(76, 38))
+    )
+
+    # geography_col
+    geo_vals = [get_val(row, "geography_col") for row in raw_rows]
+    geo_ser = gpd.GeoSeries.from_wkt(geo_vals)
+    geo_ser.index = ids
+
+    # duration_col
+    duration_vals = [
+        int(get_val(row, "duration_col"))
+        if get_val(row, "duration_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    duration_arr = pa.array(duration_vals, type=pa.duration("us"))
+    duration_ser = pd.Series(
+        duration_arr, index=ids, dtype=pd.ArrowDtype(pa.duration("us"))
+    )
+
+    df = pd.DataFrame(
+        {
+            "person": person_ser,
+            "bool_col": bool_ser,
+            "int64_col": int64_ser,
+            "float64_col": float64_ser,
+            "string_col": string_ser,
+            "json_col": json_ser,
+            "date_col": date_ser,
+            "time_col": time_ser,
+            "datetime_col": datetime_ser,
+            "timestamp_col": timestamp_ser,
+            "bytes_col": bytes_ser,
+            "numeric_col": numeric_ser,
+            "bignumeric_col": bignumeric_ser,
+            "geography_col": geo_ser,
+            "duration_col": duration_ser,
+        },
+        index=ids,
     )
-    df = df.set_index("id")
-    df["person"] = df["person"].astype(nested_structs_pandas_type)
+    df.index.name = "id"
+
     return df