Skip to content

Commit

Permalink
fix issue with missing value in column labels
Browse files: browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Dec 11, 2024
1 parent 4c81add commit 762b554
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 12 deletions.
14 changes: 9 additions & 5 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,11 @@ def get_column_metadata(column, name, arrow_type, field_name):
}
string_dtype = 'object'

if name is not None and not isinstance(name, str):
if (
name is not None
and not (isinstance(name, float) and np.isnan(name))
and not isinstance(name, str)
):
raise TypeError(
'Column name must be a string. Got column {} of type {}'.format(
name, type(name).__name__
Expand Down Expand Up @@ -340,8 +344,8 @@ def _column_name_to_strings(name):
return str(tuple(map(_column_name_to_strings, name)))
elif isinstance(name, Sequence):
raise TypeError("Unsupported type for MultiIndex level")
elif name is None:
return None
elif name is None or (isinstance(name, float) and np.isnan(name)):
return name
return str(name)


Expand Down Expand Up @@ -1077,9 +1081,9 @@ def get_pandas_logical_type_map():
'date': 'datetime64[D]',
'datetime': 'datetime64[ns]',
'datetimetz': 'datetime64[ns]',
'unicode': np.str_,
'unicode': 'str',
'bytes': np.bytes_,
'string': np.str_,
'string': 'str',
'integer': np.int64,
'floating': np.float64,
'decimal': np.object_,
Expand Down
29 changes: 22 additions & 7 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,10 +349,18 @@ def test_integer_index_column(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
_check_pandas_roundtrip(df, preserve_index=True)

def test_index_metadata_field_name(self, request):
if _pandas_api.uses_string_dtype():
# https://github.com/pandas-dev/pandas/issues/59879
request.applymarker(pytest.mark.xfail(reason="bug in pandas string dtype"))
def test_float_column_index_with_missing(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=[1.5, np.nan])
_check_pandas_roundtrip(df, preserve_index=True)

@pytest.mark.filterwarnings(
"ignore:The DataFrame has column names of mixed type:UserWarning"
)
def test_string_column_index_with_missing(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=["A", None])
_check_pandas_roundtrip(df, preserve_index=True)

def test_index_metadata_field_name(self):
# test None case, and strangely named non-index columns
df = pd.DataFrame(
[(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
Expand All @@ -362,17 +370,24 @@ def test_index_metadata_field_name(self, request):
),
columns=['a', None, '__index_level_0__'],
)
with pytest.warns(UserWarning):
if _pandas_api.uses_string_dtype():
t = pa.Table.from_pandas(df, preserve_index=True)
else:
with pytest.warns(UserWarning):
t = pa.Table.from_pandas(df, preserve_index=True)
js = t.schema.pandas_metadata

col1, col2, col3, idx0, foo = js['columns']

assert col1['name'] == 'a'
assert col1['name'] == col1['field_name']

assert col2['name'] is None
assert col2['field_name'] == 'None'
if _pandas_api.uses_string_dtype():
assert np.isnan(col2['name'])
assert col2['field_name'] == 'nan'
else:
assert col2['name'] is None
assert col2['field_name'] == 'None'

assert col3['name'] == '__index_level_0__'
assert col3['name'] == col3['field_name']
Expand Down

0 comments on commit 762b554

Please sign in to comment.