Handling ObjectId after to_pandas #242

frbelotto · 2024-10-14T15:06:18Z

Hello guys,
I am importing a big dataset from mongo:

pd_confirmacao_conversao = find_arrow_all(pd_confirmacao_conversao, {'estadoContabilizacaoEvento': { '$lt': 100}})

After that, I've just exported it to a pandas DataFrame:

pd_confirmacao_conversao = pd_confirmacao_conversao.to_pandas()

My issue is that my original dataframe contains two columns that contain ObjectIds ('_id' and 'referenciaConversao').
Because of that, when I try to run df.info(), it crashes!

pd_confirmacao_conversao.info()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 pd_confirmacao_conversao.info()

File /projeto/libs/lib/python3.11/site-packages/pandas/core/frame.py:3659, in DataFrame.info(self, verbose, buf, max_cols, memory_usage, show_counts)
   3646 @doc(INFO_DOCSTRING, **frame_sub_kwargs)
   3647 def info(
   3648     self,
   (...)
   3653     show_counts: bool | None = None,
   3654 ) -> None:
   3655     info = DataFrameInfo(
   3656         data=self,
   3657         memory_usage=memory_usage,
   3658     )
-> 3659     info.render(
   3660         buf=buf,
   3661         max_cols=max_cols,
   3662         verbose=verbose,
   3663         show_counts=show_counts,
   3664     )

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:512, in DataFrameInfo.render(self, buf, max_cols, verbose, show_counts)
    498 def render(
    499     self,
    500     *,
   (...)
    504     show_counts: bool | None,
    505 ) -> None:
    506     printer = _DataFrameInfoPrinter(
    507         info=self,
    508         max_cols=max_cols,
    509         verbose=verbose,
    510         show_counts=show_counts,
    511     )
--> 512     printer.to_buffer(buf)

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:583, in _InfoPrinterAbstract.to_buffer(self, buf)
    581 """Save dataframe info into buffer."""
    582 table_builder = self._create_table_builder()
--> 583 lines = table_builder.get_lines()
    584 if buf is None:  # pragma: no cover
    585     buf = sys.stdout

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:790, in _DataFrameTableBuilder.get_lines(self)
    788     self._fill_empty_info()
    789 else:
--> 790     self._fill_non_empty_info()
    791 return self._lines

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:960, in _DataFrameTableBuilderVerbose._fill_non_empty_info(self)
    958 self.add_dtypes_line()
    959 if self.display_memory_usage:
--> 960     self.add_memory_usage_line()

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:820, in _DataFrameTableBuilder.add_memory_usage_line(self)
    818 def add_memory_usage_line(self) -> None:
    819     """Add line containing memory usage."""
--> 820     self._lines.append(f"memory usage: {self.memory_usage_string}")

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:750, in _TableBuilderAbstract.memory_usage_string(self)
    747 @property
    748 def memory_usage_string(self) -> str:
    749     """Memory usage string with proper size qualifier."""
--> 750     return self.info.memory_usage_string

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:413, in _BaseInfo.memory_usage_string(self)
    410 @property
    411 def memory_usage_string(self) -> str:
    412     """Memory usage in a form of human readable string."""
--> 413     return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:496, in DataFrameInfo.memory_usage_bytes(self)
    493 @property
    494 def memory_usage_bytes(self) -> int:
    495     deep = self.memory_usage == "deep"
--> 496     return self.data.memory_usage(index=True, deep=deep).sum()

File /projeto/libs/lib/python3.11/site-packages/pandas/core/frame.py:3755, in DataFrame.memory_usage(self, index, deep)
   3666 def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
   3667     """
   3668     Return the memory usage of each column in bytes.
   3669 
   (...)
   3753     5244
   3754     """
-> 3755     result = self._constructor_sliced(
   3756         [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
   3757         index=self.columns,
   3758         dtype=np.intp,
   3759     )
   3760     if index:
   3761         index_memory_usage = self._constructor_sliced(
   3762             self.index.memory_usage(deep=deep), index=["Index"]
   3763         )

File /projeto/libs/lib/python3.11/site-packages/pandas/core/series.py:584, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    582         data = data.copy()
    583 else:
--> 584     data = sanitize_array(data, index, dtype, copy)
    586     manager = _get_option("mode.data_manager", silent=True)
    587     if manager == "block":

File /projeto/libs/lib/python3.11/site-packages/pandas/core/construction.py:651, in sanitize_array(data, index, dtype, copy, allow_2d)
    648     subarr = np.array([], dtype=np.float64)
    650 elif dtype is not None:
--> 651     subarr = _try_cast(data, dtype, copy)
    653 else:
    654     subarr = maybe_convert_platform(data)

File /projeto/libs/lib/python3.11/site-packages/pandas/core/construction.py:818, in _try_cast(arr, dtype, copy)
    813 # GH#15832: Check if we are requesting a numeric dtype and
    814 # that we can convert the data to the requested dtype.
    815 elif dtype.kind in "iu":
    816     # this will raise if we have e.g. floats
--> 818     subarr = maybe_cast_to_integer_array(arr, dtype)
    819 elif not copy:
    820     subarr = np.asarray(arr, dtype=dtype)

File /projeto/libs/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1657, in maybe_cast_to_integer_array(arr, dtype)
   1650         if not np_version_gt2:
   1651             warnings.filterwarnings(
   1652                 "ignore",
   1653                 "NumPy will stop allowing conversion of "
   1654                 "out-of-bound Python int",
   1655                 DeprecationWarning,
   1656             )
-> 1657         casted = np.asarray(arr, dtype=dtype)
   1658 else:
   1659     with warnings.catch_warnings():

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'method'

I can fix it by converting those columns to strings, but I want to understand what the expected behavior should be without converting them.

Thanks in advance!

pandas                            2.2.2
pyarrow                          17.0.0
pymongo                        4.8.0
pymongoarrow               1.5.1

The text was updated successfully, but these errors were encountered:

aclark4life · 2024-10-14T19:32:50Z

Thank you for the question! Tracking in https://jira.mongodb.org/browse/ARROW-256

frbelotto mentioned this issue Oct 14, 2024

Setting partial Schema to find_arrow_all and find_pandas_all #243

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Handling ObjectId after to_pandas #242

Handling ObjectId after to_pandas #242

frbelotto commented Oct 14, 2024

aclark4life commented Oct 14, 2024

Handling ObjectId after to_pandas #242

Handling ObjectId after to_pandas #242

Comments

frbelotto commented Oct 14, 2024

aclark4life commented Oct 14, 2024