Skip to content

ENH: Synchronize large parts of IO with pandas #160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pandas-stubs/_libs/lib.pyi
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
from __future__ import annotations

from enum import Enum

# NOTE(review): pandas' `no_default` sentinel is `_NoDefault.no_default`,
# not None — binding it to None loses the distinction between "argument
# omitted" and "argument explicitly None". Confirm against pandas/_libs/lib.pyi.
no_default = None

from typing import Literal

# Sentinel enum mirroring pandas._libs.lib._NoDefault: lets signatures
# distinguish "argument not supplied" from an explicit None default.
class _NoDefault(Enum):
    no_default = ...

# Type of the sentinel value, for annotating parameters defaulting to it.
NoDefault = Literal[_NoDefault.no_default]

# Return pandas' inferred dtype name (e.g. "integer", "string") for `value`;
# skipna controls whether missing values are ignored during inference.
def infer_dtype(value: object, skipna: bool = ...) -> str: ...
24 changes: 23 additions & 1 deletion pandas-stubs/_typing.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ from typing import (
AnyStr,
Callable,
Hashable,
Iterator,
Literal,
Mapping,
Optional,
Expand Down Expand Up @@ -63,10 +64,29 @@ DtypeObj = Union[np.dtype[np.generic], ExtensionDtype]
AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True)
AnyStr_con = TypeVar("AnyStr_con", str, bytes, contravariant=True)

class BaseBuffer(Protocol): ...
# Structural (duck-typed) protocol for any file-like object pandas IO
# accepts; mirrors pandas._typing.BaseBuffer.
class BaseBuffer(Protocol):
    @property
    def mode(self) -> str: ...
    def fileno(self) -> int: ...
    # Double-underscore parameters mark them positional-only, matching io.IOBase.seek.
    def seek(self, __offset: int, __whence: int = ...) -> int: ...
    def seekable(self) -> bool: ...
    def tell(self) -> int: ...

# Readable / writable file-like protocols, generic over str-vs-bytes content
# (AnyStr_cov is covariant and constrained to str | bytes above).
class ReadBuffer(BaseBuffer, Protocol[AnyStr_cov]): ...
class WriteBuffer(BaseBuffer, Protocol[AnyStr_cov]): ...

# Bytes-reading buffer protocol accepted by read_pickle.
class ReadPickleBuffer(ReadBuffer[bytes], Protocol):
    # Must return bytes: this class is not generic (plain Protocol base),
    # so the free TypeVar AnyStr_cov used previously was unbound here.
    # pandas._typing pins the return type to bytes.
    def readline(self) -> bytes: ...

# Writable bytes buffer targeted by ExcelWriter; additionally requires
# truncate() beyond the plain WriteBuffer protocol.
class WriteExcelBuffer(WriteBuffer[bytes], Protocol):
    def truncate(self, size: int | None = ...) -> int: ...

# Buffer protocol required by read_csv: a line-iterable reader that also
# exposes a `closed` flag, generic over str vs bytes content.
class ReadCsvBuffer(ReadBuffer[AnyStr_cov], Protocol):
    def __iter__(self) -> Iterator[AnyStr_cov]: ...
    def readline(self) -> AnyStr_cov: ...
    @property
    def closed(self) -> bool: ...

FilePath = Union[str, PathLike[str]]

Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]
Expand Down Expand Up @@ -208,4 +228,6 @@ GroupByObjectNonScalar = Union[
]
GroupByObject = Union[Scalar, GroupByObjectNonScalar]

# Parser engines accepted by read_csv / read_table's `engine` argument.
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]

__all__ = ["npt", "type_t"]
5 changes: 0 additions & 5 deletions pandas-stubs/io/clipboard/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
from __future__ import annotations

class PyperclipException(RuntimeError): ...

class PyperclipWindowsException(PyperclipException):
def __init__(self, message) -> None: ...

class CheckedCall:
def __init__(self, f) -> None: ...
def __call__(self, *args): ...
Expand Down
8 changes: 5 additions & 3 deletions pandas-stubs/io/clipboards.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

from pandas.core.frame import DataFrame
from typing import Any

def read_clipboard(sep: str = ..., **kwargs) -> DataFrame: ...
def to_clipboard(obj, excel: bool = ..., sep=..., **kwargs) -> None: ...
# Mirrors pandas.io.clipboards. read_clipboard's return is left untyped here;
# **kwargs are presumably forwarded to read_csv — confirm against pandas source.
def read_clipboard(sep: str = ..., **kwargs): ...
def to_clipboard(
    obj, excel: bool | None = ..., sep: str | None = ..., **kwargs: Any
) -> None: ...
174 changes: 137 additions & 37 deletions pandas-stubs/io/common.pyi
Original file line number Diff line number Diff line change
@@ -1,65 +1,165 @@
from __future__ import annotations

from collections import abc
from io import BytesIO
from abc import (
ABC,
ABCMeta,
abstractmethod,
)
from io import (
BytesIO,
StringIO,
TextIOBase,
)
from pathlib import Path
import tarfile
from typing import (
IO,
Any,
AnyStr,
Mapping,
Generic,
Literal,
TypeVar,
overload,
)
import zipfile

from pandas._typing import FilePathOrBuffer
from pandas._typing import (
BaseBuffer,
CompressionDict,
CompressionOptions,
FilePath,
ReadBuffer,
StorageOptions,
WriteBuffer,
)

_BaseBufferT = TypeVar("_BaseBufferT", bound=BaseBuffer)

# Bundle of resolved IO arguments (mirrors pandas.io.common.IOArgs).
class IOArgs:
    filepath_or_buffer: str | BaseBuffer
    encoding: str
    mode: str
    compression: CompressionDict
    # Whether the buffer was opened internally and must be closed afterwards.
    should_close: bool
    def __init__(
        self,
        filepath_or_buffer: str | BaseBuffer,
        encoding: str,
        mode: str,
        compression: CompressionDict,
        should_close: bool,
    ) -> None: ...

lzma = ...
# Container of opened file handles returned by get_handle, usable as a
# context manager (mirrors pandas.io.common.IOHandles).
class IOHandles(Generic[AnyStr]):
    # The handle callers should read from / write to.
    handle: IO[AnyStr]
    compression: CompressionDict
    # Handles opened internally — presumably the ones close() closes; confirm
    # against pandas.io.common.
    created_handles: list[IO[bytes] | IO[str]]
    # True when `handle` wraps a caller-provided buffer.
    is_wrapped: bool
    def close(self) -> None: ...
    def __enter__(self) -> IOHandles[AnyStr]: ...
    def __exit__(self, *args: object) -> None: ...
    def __init__(
        self,
        handle: IO[AnyStr],
        compression: CompressionDict,
        created_handles: list[IO[bytes] | IO[str]],
        is_wrapped: bool,
    ) -> None: ...

def is_url(url) -> bool: ...
def validate_header_arg(header) -> None: ...
# `object`-typed parameters: these validators accept any value and narrow it
# internally, reporting via return value / raised error respectively.
def is_url(url: object) -> bool: ...
def validate_header_arg(header: object) -> None: ...
@overload
def stringify_path(
filepath_or_buffer: FilePathOrBuffer[AnyStr],
) -> FilePathOrBuffer[AnyStr]: ...
def is_s3_url(url) -> bool: ...
def is_gcs_url(url) -> bool: ...
def urlopen(*args, **kwargs) -> IO: ...
def get_filepath_or_buffer(
filepath_or_buffer: FilePathOrBuffer,
encoding: str | None = ...,
compression: str | None = ...,
mode: str | None = ...,
): ...
filepath_or_buffer: FilePath, convert_file_like: bool = ...
) -> str: ...
@overload
def stringify_path(
filepath_or_buffer: _BaseBufferT, convert_file_like: bool = ...
) -> _BaseBufferT: ...
def urlopen(*args, **kwargs): ...
# True when `url` is a string URL handled by fsspec (e.g. "s3://...", "gcs://...").
def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: ...
# Convert a local file path to a "file://" URL.
def file_path_to_url(path: str) -> str: ...
def get_compression_method(
compression: str | Mapping[str, str] | None
) -> tuple[str | None, dict[str, str]]: ...
compression: CompressionOptions,
) -> tuple[str | None, CompressionDict]: ...
def infer_compression(
filepath_or_buffer: FilePathOrBuffer, compression: str | None
filepath_or_buffer: FilePath | BaseBuffer, compression: str | None
) -> str | None: ...
# Validate that the parent directory of `path` exists before writing to it.
def check_parent_directory(path: Path | str) -> None: ...
@overload
def get_handle(
path_or_buf,
path_or_buf: FilePath | BaseBuffer,
mode: str,
encoding=...,
compression: str | Mapping[str, Any] | None = ...,
*,
encoding: str | None = ...,
compression: CompressionOptions = ...,
memory_map: bool = ...,
is_text: Literal[False],
errors: str | None = ...,
storage_options: StorageOptions = ...,
) -> IOHandles[bytes]: ...
@overload
def get_handle(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, only classes/functions/... listed on this page are considered to be public https://pandas.pydata.org/docs/reference/index.html Since there are many classes/functions that should be public but are not listed there, people seem to assume that classes/functions that do not start with _ are by common conventions public.

As far as I know, nothing in io/common is meant to be public. Adding it here might suggest that it is public. That is obviously a pandas issue but it would be great to discuss some guidelines of what should/shouldn't be in the stubs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that is the goal then it is probably a lot more doable. The stubs now look like they were generated using an earlier version of stubgen. They have a lot of wrong things in them, including entire files that have no corresponding code in pandas.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, only classes/functions/... listed on this page are considered to be public https://pandas.pydata.org/docs/reference/index.html Since there are many classes/functions that should be public but are not listed there, people seem to assume that classes/functions that do not start with _ are by common conventions public.

As far as I know, nothing in io/common is meant to be public. Adding it here might suggest that it is public. That is obviously a pandas issue but it would be great to discuss some guidelines of what should/shouldn't be in the stubs.

We could have that discussion outside this issue. Do you want to start an issue or a discussion on this topic? I have some thoughts on it, but would rather discuss it in a more focused area on that topic.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that is the goal then it is probably a lot more doable. The stubs now look like they were generated using an earlier version of stubgen. They have a lot of wrong things in them, including entire files that have no corresponding code in pandas.

Yes, this is a historical artifact. These stubs were generated by Microsoft, with stubgen, and possibly on pandas 1.1 or 1.2. I used them heavily when they were only shipped with VS Code, and kept doing PR's there to make my team's code pass things, and then we had discussions in our monthly pandas dev meetings about how to move forward, given there was another effort for stubs that had testing (which is where the tests here came from). The net result is these stubs, which we all knew would take a lot of work to get right but which provided a good starting point. We didn't want to wait to make them "perfect".

path_or_buf: FilePath | BaseBuffer,
mode: str,
*,
encoding: str | None = ...,
compression: CompressionOptions = ...,
memory_map: bool = ...,
is_text: Literal[True] = ...,
errors: str | None = ...,
storage_options: StorageOptions = ...,
) -> IOHandles[str]: ...
@overload
def get_handle(
path_or_buf: FilePath | BaseBuffer,
mode: str,
*,
encoding: str | None = ...,
compression: CompressionOptions = ...,
memory_map: bool = ...,
is_text: bool = ...,
): ...
errors: str | None = ...,
storage_options: StorageOptions = ...,
) -> IOHandles[str] | IOHandles[bytes]: ...

# ignore similar to what is in pandas source
class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc]
archive_name = ...
# Abstract in-memory BytesIO base for archive writers; subclasses implement
# write_to_buffer() (presumably invoked from close() — confirm in pandas.io.common).
class _BufferedWriter(BytesIO, ABC, metaclass=ABCMeta):
    @abstractmethod
    def write_to_buffer(self) -> None: ...
    def close(self) -> None: ...
    def __enter__(self) -> _BufferedWriter: ...

# tarfile-backed _BufferedWriter: buffered bytes become one archive member.
class _BytesTarFile(_BufferedWriter):
    archive_name: str | None
    name: str
    buffer: tarfile.TarFile
    def __init__(
        self,
        # Same union as _BytesZipFile below: `FilePathOrBuffer` is not imported
        # by this module's new import block (FilePath/ReadBuffer/WriteBuffer
        # replaced it), so referencing it here would be an undefined name.
        file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
        name: str | None = ...,
        mode: Literal["r", "a", "w", "x"] = ...,
        fileobj: ReadBuffer[bytes] | WriteBuffer[bytes] | None = ...,
        archive_name: str | None = ...,
        **kwargs,
    ) -> None: ...
    # Map a generic open mode onto tarfile's mode string.
    def extend_mode(self, mode: str) -> str: ...
    # Best-effort member filename when no archive_name was given.
    def infer_filename(self) -> str | None: ...
    def write_to_buffer(self) -> None: ...

# zipfile-backed _BufferedWriter: buffered bytes become one archive member
# named `archive_name` (or an inferred filename).
class _BytesZipFile(_BufferedWriter):
    archive_name: str | None
    buffer: zipfile.ZipFile
    def __init__(
        self,
        file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
        mode: str,
        archive_name: str | None = ...,
        **kwargs,
    ) -> None: ...
    @property
    def closed(self) -> bool: ...
    # Best-effort member filename when no archive_name was given.
    def infer_filename(self) -> str | None: ...
    def write_to_buffer(self) -> None: ...

class _MMapWrapper(abc.Iterator):
mmap = ...
def __init__(self, f: IO) -> None: ...
class _IOWrapper:
buffer: BaseBuffer
def __init__(self, buffer: BaseBuffer) -> None: ...
def __getattr__(self, name: str): ...
def __iter__(self) -> _MMapWrapper: ...
def __next__(self) -> str: ...
def readable(self) -> bool: ...
def seekable(self) -> bool: ...
def writable(self) -> bool: ...

# Adapter exposing a bytes read() interface over a text (str) buffer by
# encoding on the fly; other attributes delegate via __getattr__.
class _BytesIOWrapper:
    buffer: StringIO | TextIOBase
    encoding: str
    # Presumably leftover encoded bytes carried between read() calls when a
    # read crosses a character boundary — confirm in pandas.io.common.
    overflow: bytes
    def __init__(self, buffer: StringIO | TextIOBase, encoding: str = ...) -> None: ...
    def __getattr__(self, attr: str): ...
    def read(self, n: int | None = ...) -> bytes: ...

# Whether `filepath_or_buffer` names an existing file (mirrors pandas.io.common).
def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool: ...
12 changes: 8 additions & 4 deletions pandas-stubs/io/date_converters.pyi
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from __future__ import annotations

def parse_date_time(date_col, time_col): ...
def parse_date_fields(year_col, month_col, day_col): ...
import numpy as np

from pandas._typing import npt

# Stubs for pandas.io.date_converters: each combines per-component columns
# into an object-dtype array of datetimes. Column parameters are untyped here.
def parse_date_time(date_col, time_col) -> npt.NDArray[np.object_]: ...
def parse_date_fields(year_col, month_col, day_col) -> npt.NDArray[np.object_]: ...
def parse_all_fields(
    year_col, month_col, day_col, hour_col, minute_col, second_col
) -> npt.NDArray[np.object_]: ...
# Apply `parse_func` element-wise across the given columns.
def generic_parser(parse_func, *cols) -> np.ndarray: ...
26 changes: 26 additions & 0 deletions pandas-stubs/io/excel/_odfreader.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from pandas._libs.tslibs.nattype import NaTType
from pandas._typing import (
FilePath,
ReadBuffer,
Scalar,
StorageOptions,
)

from pandas.io.excel._base import BaseExcelReader

# Reader for OpenDocument spreadsheets (.ods), plugged into pandas' Excel
# machinery through BaseExcelReader.
class ODFReader(BaseExcelReader):
    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        storage_options: StorageOptions = ...,
    ) -> None: ...
    # Untyped return: presumably an `odf` OpenDocument object (optional dependency).
    def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): ...
    # String used for empty cells when building sheet data.
    @property
    def empty_value(self) -> str: ...
    @property
    def sheet_names(self) -> list[str]: ...
    # Both untyped: presumably `odf` table objects (optional dependency).
    def get_sheet_by_index(self, index: int): ...
    def get_sheet_by_name(self, name: str): ...
    # Rows of cell values; file_rows_needed allows early stopping (e.g. nrows).
    def get_sheet_data(
        self, sheet, convert_float: bool, file_rows_needed: int | None = ...
    ) -> list[list[Scalar | NaTType]]: ...
29 changes: 29 additions & 0 deletions pandas-stubs/io/excel/_odswriter.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from typing import Any

from _typeshed import Incomplete

from pandas._typing import (
FilePath,
StorageOptions,
WriteExcelBuffer,
)

from pandas.io.excel._base import ExcelWriter

# Writer for OpenDocument spreadsheets, plugged into pandas' ExcelWriter API.
class ODSWriter(ExcelWriter):
    def __init__(
        self,
        path: FilePath | WriteExcelBuffer | ExcelWriter,
        engine: str | None = ...,
        date_format: str | None = ...,
        datetime_format: Incomplete | None = ...,
        mode: str = ...,
        storage_options: StorageOptions = ...,
        if_sheet_exists: str | None = ...,
        engine_kwargs: dict[str, Any] | None = ...,
        **kwargs,
    ) -> None: ...
    # Untyped return: presumably the underlying `odf` document object
    # (optional dependency).
    @property
    def book(self): ...
    # Mapping of sheet name to the sheet object within the workbook.
    @property
    def sheets(self) -> dict[str, Any]: ...
Loading