Skip to content

Commit 6965558

Browse files
committed
fix(bigquery): write pandas datetime[ns] columns to BigQuery TIMESTAMP columns
Also: * Enable TIMESTAMP and DATETIME unit tests for `_pandas_helpers`. * Add more data types to load dataframe sample.
1 parent 154c8ec commit 6965558

File tree

4 files changed

+173
-38
lines changed

4 files changed

+173
-38
lines changed

bigquery/google/cloud/bigquery/_pandas_helpers.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,11 @@
5252
_PANDAS_DTYPE_TO_BQ = {
5353
"bool": "BOOLEAN",
5454
"datetime64[ns, UTC]": "TIMESTAMP",
55-
"datetime64[ns]": "DATETIME",
55+
# Due to internal bug 147108331, BigQuery always interprets DATETIME
56+
# columns as having the wrong precision. In the meantime, work around this
57+
# by writing the values as TIMESTAMP. See:
58+
# https://github.com/googleapis/google-cloud-python/issues/9996
59+
"datetime64[ns]": "TIMESTAMP",
5660
"float32": "FLOAT",
5761
"float64": "FLOAT",
5862
"int8": "INTEGER",
@@ -218,7 +222,7 @@ def bq_to_arrow_array(series, bq_field):
218222
return pyarrow.ListArray.from_pandas(series, type=arrow_type)
219223
if field_type_upper in schema._STRUCT_TYPES:
220224
return pyarrow.StructArray.from_pandas(series, type=arrow_type)
221-
return pyarrow.array(series, type=arrow_type)
225+
return pyarrow.Array.from_pandas(series, type=arrow_type)
222226

223227

224228
def get_column_or_index(dataframe, name):

bigquery/samples/load_table_dataframe.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616
def load_table_dataframe(client, table_id):
1717

1818
# [START bigquery_load_table_dataframe]
19-
from google.cloud import bigquery
19+
import datetime
2020

21+
from google.cloud import bigquery
2122
import pandas
23+
import pytz
2224

2325
# TODO(developer): Construct a BigQuery client object.
2426
# client = bigquery.Client()
@@ -27,16 +29,54 @@ def load_table_dataframe(client, table_id):
2729
# table_id = "your-project.your_dataset.your_table_name"
2830

2931
records = [
30-
{"title": u"The Meaning of Life", "release_year": 1983},
31-
{"title": u"Monty Python and the Holy Grail", "release_year": 1975},
32-
{"title": u"Life of Brian", "release_year": 1979},
33-
{"title": u"And Now for Something Completely Different", "release_year": 1971},
32+
{
33+
"title": u"The Meaning of Life",
34+
"release_year": 1983,
35+
"length_minutes": 112.5,
36+
"release_date": datetime.datetime(
37+
1983, 5, 9, 13, 0, 0, tzinfo=pytz.timezone("Europe/Paris")
38+
),
39+
"dvd_release": datetime.datetime(2002, 1, 22, 7, 0, 0),
40+
},
41+
{
42+
"title": u"Monty Python and the Holy Grail",
43+
"release_year": 1975,
44+
"length_minutes": 91.5,
45+
"release_date": datetime.datetime(
46+
1975, 4, 9, 23, 59, 2, tzinfo=pytz.timezone("Europe/London")
47+
),
48+
"dvd_release": datetime.datetime(2002, 7, 16, 9, 0, 0),
49+
},
50+
{
51+
"title": u"Life of Brian",
52+
"release_year": 1979,
53+
"length_minutes": 94.25,
54+
"release_date": datetime.datetime(
55+
1979, 8, 17, 23, 59, 5, tzinfo=pytz.timezone("America/New_York")
56+
),
57+
"dvd_release": datetime.datetime(2008, 1, 14, 8, 0, 0),
58+
},
59+
{
60+
"title": u"And Now for Something Completely Different",
61+
"release_year": 1971,
62+
"length_minutes": 88.0,
63+
"release_date": datetime.datetime(
64+
1971, 9, 28, 23, 59, 7, tzinfo=pytz.timezone("Europe/London")
65+
),
66+
"dvd_release": datetime.datetime(2003, 10, 22, 10, 0, 0),
67+
},
3468
]
3569
dataframe = pandas.DataFrame(
3670
records,
3771
# In the loaded table, the column order reflects the order of the
3872
# columns in the DataFrame.
39-
columns=["title", "release_year"],
73+
columns=[
74+
"title",
75+
"release_year",
76+
"length_minutes",
77+
"release_date",
78+
"dvd_release",
79+
],
4080
# Optionally, set a named index, which can also be written to the
4181
# BigQuery table.
4282
index=pandas.Index(

bigquery/samples/tests/test_load_table_dataframe.py

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import datetime
16+
1517
import pytest
18+
import pytz
1619

1720
from .. import load_table_dataframe
1821

@@ -25,7 +28,67 @@ def test_load_table_dataframe(capsys, client, random_table_id):
2528

2629
table = load_table_dataframe.load_table_dataframe(client, random_table_id)
2730
out, _ = capsys.readouterr()
28-
assert "Loaded 4 rows and 3 columns" in out
31+
expected_column_names = [
32+
"wikidata_id",
33+
"title",
34+
"release_year",
35+
"length_minutes",
36+
"release_date",
37+
"dvd_release",
38+
]
39+
assert "Loaded 4 rows and {} columns".format(len(expected_column_names)) in out
2940

3041
column_names = [field.name for field in table.schema]
31-
assert column_names == ["wikidata_id", "title", "release_year"]
42+
assert column_names == expected_column_names
43+
column_types = [field.field_type for field in table.schema]
44+
assert column_types == [
45+
"STRING",
46+
"STRING",
47+
"INTEGER",
48+
"FLOAT",
49+
"TIMESTAMP",
50+
# Due to internal bug 147108331, BigQuery always interprets DATETIME
51+
# columns as having the wrong precision. In the meantime, work around this
52+
# by writing the values as TIMESTAMP. See:
53+
# https://github.com/googleapis/google-cloud-python/issues/9996
54+
"TIMESTAMP",
55+
]
56+
57+
df = client.list_rows(table).to_dataframe()
58+
df.sort_values("release_year", inplace=True)
59+
assert df["title"].tolist() == [
60+
u"And Now for Something Completely Different",
61+
u"Monty Python and the Holy Grail",
62+
u"Life of Brian",
63+
u"The Meaning of Life",
64+
]
65+
assert df["release_year"].tolist() == [
66+
1971,
67+
1975,
68+
1979,
69+
1983,
70+
]
71+
assert df["length_minutes"].tolist() == [
72+
88.0,
73+
91.5,
74+
94.25,
75+
112.5,
76+
]
77+
assert df["release_date"].tolist() == [
78+
pandas.Timestamp("1971-09-28T23:59:07+00:00"),
79+
pandas.Timestamp("1975-04-09T23:59:02+00:00"),
80+
pandas.Timestamp("1979-08-17T23:59:05+00:00"),
81+
pandas.Timestamp("1983-05-09T13:00:00+00:00"),
82+
]
83+
assert df["dvd_release"].tolist() == [
84+
pandas.Timestamp("2003-10-22T10:00:00+00:00"),
85+
pandas.Timestamp("2002-07-16T09:00:00+00:00"),
86+
pandas.Timestamp("2008-01-14T08:00:00+00:00"),
87+
pandas.Timestamp("2002-01-22T07:00:00+00:00"),
88+
]
89+
assert df["wikidata_id"].tolist() == [
90+
u"Q16403",
91+
u"Q25043",
92+
u"Q24953",
93+
u"Q24980",
94+
]

bigquery/tests/unit/test__pandas_helpers.py

Lines changed: 56 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ def test_is_datetime():
9292
assert is_datetime(pyarrow.timestamp("us", tz=None))
9393
assert not is_datetime(pyarrow.timestamp("ms", tz=None))
9494
assert not is_datetime(pyarrow.timestamp("us", tz="UTC"))
95+
assert not is_datetime(pyarrow.timestamp("ns", tz="UTC"))
9596
assert not is_datetime(pyarrow.string())
9697

9798

@@ -386,20 +387,15 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
386387
),
387388
("BOOLEAN", [True, None, False, None]),
388389
("BOOL", [False, None, True, None]),
389-
# TODO: Once https://issues.apache.org/jira/browse/ARROW-5450 is
390-
# resolved, test with TIMESTAMP column. Conversion from pyarrow
391-
# TimestampArray to list of Python objects fails with OverflowError:
392-
# Python int too large to convert to C long.
393-
#
394-
# (
395-
# "TIMESTAMP",
396-
# [
397-
# datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
398-
# None,
399-
# datetime.datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc),
400-
# datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
401-
# ],
402-
# ),
390+
(
391+
"TIMESTAMP",
392+
[
393+
datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
394+
None,
395+
datetime.datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc),
396+
datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
397+
],
398+
),
403399
(
404400
"DATE",
405401
[
@@ -418,20 +414,16 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
418414
datetime.time(12, 0, 0),
419415
],
420416
),
421-
# TODO: Once https://issues.apache.org/jira/browse/ARROW-5450 is
422-
# resolved, test with DATETIME column. Conversion from pyarrow
423-
# TimestampArray to list of Python objects fails with OverflowError:
424-
# Python int too large to convert to C long.
425-
#
426-
# (
427-
# "DATETIME",
428-
# [
429-
# datetime.datetime(1, 1, 1, 0, 0, 0),
430-
# None,
431-
# datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
432-
# datetime.datetime(1970, 1, 1, 0, 0, 0),
433-
# ],
434-
# ),
417+
(
418+
"DATETIME",
419+
[
420+
datetime.datetime(1, 1, 1, 0, 0, 0),
421+
datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
422+
None,
423+
datetime.datetime(1970, 1, 1, 0, 0, 0),
424+
datetime.datetime(1999, 3, 14, 15, 9, 26, 535898),
425+
],
426+
),
435427
(
436428
"GEOGRAPHY",
437429
[
@@ -453,6 +445,42 @@ def test_bq_to_arrow_array_w_nullable_scalars(module_under_test, bq_type, rows):
453445
assert rows == roundtrip
454446

455447

448+
@pytest.mark.parametrize(
449+
"bq_type,rows",
450+
[
451+
(
452+
"TIMESTAMP",
453+
[
454+
"1971-09-28T23:59:07+00:00",
455+
"1975-04-09T23:59:02+00:00",
456+
"1979-08-17T23:59:05+00:00",
457+
"NaT",
458+
"1983-05-09T13:00:00+00:00",
459+
],
460+
),
461+
(
462+
"DATETIME",
463+
[
464+
"1971-09-28T23:59:07",
465+
"1975-04-09T23:59:02",
466+
"1979-08-17T23:59:05",
467+
"NaT",
468+
"1983-05-09T13:00:00",
469+
],
470+
),
471+
],
472+
)
473+
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
474+
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
475+
def test_bq_to_arrow_array_w_pandas_timestamp(module_under_test, bq_type, rows):
476+
rows = [pandas.Timestamp(row) for row in rows]
477+
series = pandas.Series(rows)
478+
bq_field = schema.SchemaField("field_name", bq_type)
479+
arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
480+
roundtrip = arrow_array.to_pandas()
481+
assert series.equals(roundtrip)
482+
483+
456484
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
457485
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
458486
def test_bq_to_arrow_array_w_arrays(module_under_test):

0 commit comments

Comments
 (0)