""" orc compat """ from __future__ import annotations import io from types import ModuleType from typing import ( TYPE_CHECKING, Any, Literal, ) from pandas._typing import ( FilePath, ReadBuffer, WriteBuffer, ) from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import ( is_categorical_dtype, is_interval_dtype, is_period_dtype, is_unsigned_integer_dtype, ) from pandas.io.common import get_handle if TYPE_CHECKING: from pandas import DataFrame def read_orc( path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs ) -> DataFrame: """ Load an ORC object from the file path, returning a DataFrame. .. versionadded:: 1.0.0 Parameters ---------- path : str, path object, or file-like object String, path object (implementing ``os.PathLike[str]``), or file-like object implementing a binary ``read()`` function. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.orc``. columns : list, default None If not None, only these columns will be read from the file. **kwargs Any additional kwargs are passed to pyarrow. Returns ------- DataFrame Notes ----- Before using this function you should read the :ref:`user guide about ORC ` and :ref:`install optional dependencies `. """ # we require a newer version of pyarrow than we support for parquet orc = import_optional_dependency("pyarrow.orc") with get_handle(path, "rb", is_text=False) as handles: orc_file = orc.ORCFile(handles.handle) return orc_file.read(columns=columns, **kwargs).to_pandas() def to_orc( df: DataFrame, path: FilePath | WriteBuffer[bytes] | None = None, *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ Write a DataFrame to the ORC format. .. versionadded:: 1.5.0 Parameters ---------- df : DataFrame The dataframe to be written to ORC. Raises NotImplementedError if dtype of one or more columns is category, unsigned integers, intervals, periods or sparse. path : str, file-like object or None, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. engine : str, default 'pyarrow' ORC library to use. Pyarrow must be >= 7.0.0. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, similar to ``infer`` the dataframe's index(es) will be saved. However, instead of being saved as values, the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. engine_kwargs : dict[str, Any] or None, default None Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. Returns ------- bytes if no path argument is provided else None Raises ------ NotImplementedError Dtype of one or more columns is category, unsigned integers, interval, period or sparse. ValueError engine is not pyarrow. Notes ----- * Before using this function you should read the :ref:`user guide about ORC ` and :ref:`install optional dependencies `. * This function requires `pyarrow `_ library. * For supported dtypes please refer to `supported ORC features in Arrow `__. 
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError
    # In Pyarrow 9.0.0 this check will no longer be needed
    for dtype in df.dtypes:
        if (
            is_categorical_dtype(dtype)
            or is_interval_dtype(dtype)
            or is_period_dtype(dtype)
            or is_unsigned_integer_dtype(dtype)
        ):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            )

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    engine = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    was_none = path is None
    if was_none:
        # No destination was given: write to an in-memory buffer so the
        # serialized ORC file can be returned as bytes.
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if was_none:
        assert isinstance(path, io.BytesIO)  # For mypy
        return path.getvalue()
    return None
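

# A minimal usage sketch of the two functions above: round-trips a frame
# through ``to_orc``/``read_orc`` via an in-memory buffer instead of a file
# path. Assumes pyarrow >= 7.0.0 is installed; the frame contents are
# invented purely for illustration.
if __name__ == "__main__":
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # With path=None, to_orc serializes into a BytesIO buffer and returns
    # the ORC file as bytes.
    data = to_orc(df)
    assert data is not None

    # read_orc accepts any binary file-like object, so the bytes round-trip
    # without touching the filesystem.
    roundtripped = read_orc(io.BytesIO(data))
    assert roundtripped.equals(df)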