Skip to content

ENH: Synchronize large parts of IO with pandas #160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion pandas-stubs/_typing.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,20 @@ DtypeObj = Union[np.dtype[np.generic], ExtensionDtype]
AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True)
AnyStr_con = TypeVar("AnyStr_con", str, bytes, contravariant=True)

class BaseBuffer(Protocol): ...
class BaseBuffer(Protocol):
    """Structural (duck-typed) protocol for file-like objects.

    Any object exposing the attributes below matches; the direction-specific
    ReadBuffer/WriteBuffer protocols refine this base.
    """

    @property
    def mode(self) -> str: ...
    def fileno(self) -> int: ...
    def seek(self, __offset: int, __whence: int = ...) -> int: ...
    def seekable(self) -> bool: ...
    def tell(self) -> int: ...

# Direction-specific refinements of BaseBuffer, parameterized over the
# buffer's element type (str or bytes) via the covariant AnyStr_cov TypeVar.
class ReadBuffer(BaseBuffer, Protocol[AnyStr_cov]): ...
class WriteBuffer(BaseBuffer, Protocol[AnyStr_cov]): ...

class ReadPickleBuffer(ReadBuffer[bytes], Protocol):
    """A bytes ReadBuffer that additionally supports readline()."""

    # BUG FIX: the return type was AnyStr_cov, a TypeVar that is unbound
    # here — this Protocol is not generic, so type checkers reject it.
    # The element type is already fixed to bytes via ReadBuffer[bytes],
    # so readline() returns bytes.
    def readline(self) -> bytes: ...

FilePath = Union[str, PathLike[str]]

Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]
Expand Down
5 changes: 0 additions & 5 deletions pandas-stubs/io/clipboard/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
from __future__ import annotations

class PyperclipException(RuntimeError): ...

class PyperclipWindowsException(PyperclipException):
def __init__(self, message) -> None: ...

class CheckedCall:
    """Callable wrapper around a function ``f``.

    NOTE(review): mirrors pyperclip's CheckedCall (this is the clipboard
    backend stub) — presumably ``__call__`` forwards to ``f`` and checks
    the result; confirm against the pandas vendored pyperclip source.
    """

    def __init__(self, f) -> None: ...
    def __call__(self, *args): ...
Expand Down
8 changes: 5 additions & 3 deletions pandas-stubs/io/clipboards.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

from pandas.core.frame import DataFrame
from typing import Any

def read_clipboard(sep: str = ..., **kwargs) -> DataFrame: ...
def to_clipboard(obj, excel: bool = ..., sep=..., **kwargs) -> None: ...
# NOTE(review): the previous stub annotated read_clipboard as returning
# DataFrame; this revision drops the return annotation (implicitly Any)
# and removes the DataFrame import. pandas.read_clipboard does return a
# DataFrame — confirm and restore the annotation.
def read_clipboard(sep: str = ..., **kwargs): ...
def to_clipboard(
    obj, excel: bool | None = ..., sep: str | None = ..., **kwargs: Any
) -> None: ...
174 changes: 137 additions & 37 deletions pandas-stubs/io/common.pyi
Original file line number Diff line number Diff line change
@@ -1,65 +1,165 @@
from __future__ import annotations

from collections import abc
from io import BytesIO
from abc import (
ABC,
ABCMeta,
abstractmethod,
)
from io import (
BytesIO,
StringIO,
TextIOBase,
)
from pathlib import Path
import tarfile
from typing import (
IO,
Any,
AnyStr,
Mapping,
Generic,
Literal,
TypeVar,
overload,
)
import zipfile

from pandas._typing import FilePathOrBuffer
from pandas._typing import (
BaseBuffer,
CompressionDict,
CompressionOptions,
FilePath,
ReadBuffer,
StorageOptions,
WriteBuffer,
)

_BaseBufferT = TypeVar("_BaseBufferT", bound=BaseBuffer)

class IOArgs:
    """Container describing an IO target and how it is to be opened."""

    # The target: either a path string or an already-open buffer.
    filepath_or_buffer: str | BaseBuffer
    encoding: str
    mode: str
    # Normalized compression mapping (method name plus extra options).
    compression: CompressionDict
    # Name-based reading: whether the handle should be closed after use —
    # confirm semantics against pandas.io.common.IOArgs.
    should_close: bool
    def __init__(
        self, filepath_or_buffer, encoding, mode, compression, should_close
    ) -> None: ...

lzma = ...
class IOHandles(Generic[AnyStr]):
    """Holder for an open IO handle, generic over its element type
    (str or bytes); usable as a context manager (see __enter__/__exit__).
    """

    handle: IO[AnyStr]
    compression: CompressionDict
    # Additional handles opened while building `handle` (e.g. wrapper
    # layers) — presumably closed together with it; confirm in pandas source.
    created_handles: list[IO[bytes] | IO[str]]
    is_wrapped: bool
    def close(self) -> None: ...
    def __enter__(self) -> IOHandles[AnyStr]: ...
    def __exit__(self, *args: object) -> None: ...
    def __init__(self, handle, compression, created_handles, is_wrapped) -> None: ...

def is_url(url) -> bool: ...
def validate_header_arg(header) -> None: ...
def is_url(url: object) -> bool: ...
def validate_header_arg(header: object) -> None: ...
@overload
def stringify_path(
filepath_or_buffer: FilePathOrBuffer[AnyStr],
) -> FilePathOrBuffer[AnyStr]: ...
def is_s3_url(url) -> bool: ...
def is_gcs_url(url) -> bool: ...
def urlopen(*args, **kwargs) -> IO: ...
def get_filepath_or_buffer(
filepath_or_buffer: FilePathOrBuffer,
encoding: str | None = ...,
compression: str | None = ...,
mode: str | None = ...,
): ...
filepath_or_buffer: FilePath, convert_file_like: bool = ...
) -> str: ...
@overload
def stringify_path(
filepath_or_buffer: _BaseBufferT, convert_file_like: bool = ...
) -> _BaseBufferT: ...
def urlopen(*args, **kwargs): ...
def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: ...
def file_path_to_url(path: str) -> str: ...
def get_compression_method(
compression: str | Mapping[str, str] | None
) -> tuple[str | None, dict[str, str]]: ...
compression: CompressionOptions,
) -> tuple[str | None, CompressionDict]: ...
def infer_compression(
filepath_or_buffer: FilePathOrBuffer, compression: str | None
filepath_or_buffer: FilePath | BaseBuffer, compression: str | None
) -> str | None: ...
def check_parent_directory(path: Path | str) -> None: ...
@overload
def get_handle(
path_or_buf,
path_or_buf: FilePath | BaseBuffer,
mode: str,
encoding=...,
compression: str | Mapping[str, Any] | None = ...,
*,
encoding: str | None = ...,
compression: CompressionOptions = ...,
memory_map: bool = ...,
is_text: Literal[False],
errors: str | None = ...,
storage_options: StorageOptions = ...,
) -> IOHandles[bytes]: ...
@overload
def get_handle(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, only the classes/functions/... listed on this page are considered public: https://pandas.pydata.org/docs/reference/index.html. Since there are many classes/functions that should be public but are not listed there, people seem to assume that classes/functions that do not start with _ are, by common convention, public.

As far as I know, nothing in io/common is meant to be public. Adding it here might suggest that it is public. That is obviously a pandas issue but it would be great to discuss some guidelines of what should/shouldn't be in the stubs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that is the goal then it is probably a lot more doable. The stubs now look like they were generated using an earlier version of stubgen. They have a lot of wrong things in them, including entire files that have no corresponding code in pandas.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, only the classes/functions/... listed on this page are considered public: https://pandas.pydata.org/docs/reference/index.html. Since there are many classes/functions that should be public but are not listed there, people seem to assume that classes/functions that do not start with _ are, by common convention, public.

As far as I know, nothing in io/common is meant to be public. Adding it here might suggest that it is public. That is obviously a pandas issue but it would be great to discuss some guidelines of what should/shouldn't be in the stubs.

We could have that discussion outside this issue. Do you want to start an issue or a discussion on this topic? I have some thoughts on it, but would rather discuss it in a more focused area on that topic.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that is the goal then it is probably a lot more doable. The stubs now look like they were generated using an earlier version of stubgen. They have a lot of wrong things in them, including entire files that have no corresponding code in pandas.

Yes, this is a historical artifact. These stubs were generated by Microsoft, with stubgen, and possibly on pandas 1.1 or 1.2. I used them heavily when they were only shipped with VS Code, and kept doing PRs there to make my team's code pass things, and then we had discussions in our monthly pandas dev meetings about how to move forward, given there was another effort for stubs that had testing (which is where the tests here came from). The net result is these stubs, which we all knew would take a lot of work to get right, but which provided a good starting point. We didn't want to wait to make them "perfect".

path_or_buf: FilePath | BaseBuffer,
mode: str,
*,
encoding: str | None = ...,
compression: CompressionOptions = ...,
memory_map: bool = ...,
is_text: Literal[True] = ...,
errors: str | None = ...,
storage_options: StorageOptions = ...,
) -> IOHandles[str]: ...
@overload
def get_handle(
path_or_buf: FilePath | BaseBuffer,
mode: str,
*,
encoding: str | None = ...,
compression: CompressionOptions = ...,
memory_map: bool = ...,
is_text: bool = ...,
): ...
errors: str | None = ...,
storage_options: StorageOptions = ...,
) -> IOHandles[str] | IOHandles[bytes]: ...

# ignore similar to what is in pandas source
class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc]
archive_name = ...
class _BufferedWriter(BytesIO, ABC, metaclass=ABCMeta):
    """Abstract in-memory bytes buffer whose contents are flushed by the
    subclass-provided write_to_buffer() (subclasses below target tar and
    zip archives)."""

    @abstractmethod
    def write_to_buffer(self) -> None: ...
    def close(self) -> None: ...
    def __enter__(self) -> _BufferedWriter: ...

class _BytesTarFile(_BufferedWriter):
    """Buffered writer whose bytes are emitted into a tarfile.TarFile."""

    archive_name: str | None
    name: str
    buffer: tarfile.TarFile
    def __init__(
        self,
        # BUG FIX: was annotated FilePathOrBuffer, an alias whose import
        # this revision removes from the module — an undefined name.
        # Use the same explicit union as the sibling _BytesZipFile.
        file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
        name: str | None = ...,
        mode: Literal["r", "a", "w", "x"] = ...,
        fileobj: ReadBuffer[bytes] | WriteBuffer[bytes] | None = ...,
        archive_name: str | None = ...,
        **kwargs,
    ) -> None: ...
    def extend_mode(self, mode: str) -> str: ...
    def infer_filename(self) -> str | None: ...
    def write_to_buffer(self) -> None: ...

class _BytesZipFile(_BufferedWriter):
    """Buffered writer whose bytes are emitted into a zipfile.ZipFile."""

    # Name of the entry written inside the archive, if given.
    archive_name: str | None
    buffer: zipfile.ZipFile
    def __init__(
        self,
        file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
        mode: str,
        archive_name: str | None = ...,
        **kwargs,
    ) -> None: ...
    @property
    def closed(self) -> bool: ...
    def infer_filename(self) -> str | None: ...
    def write_to_buffer(self) -> None: ...

class _MMapWrapper(abc.Iterator):
mmap = ...
def __init__(self, f: IO) -> None: ...
class _IOWrapper:
buffer: BaseBuffer
def __init__(self, buffer: BaseBuffer) -> None: ...
def __getattr__(self, name: str): ...
def __iter__(self) -> _MMapWrapper: ...
def __next__(self) -> str: ...
def readable(self) -> bool: ...
def seekable(self) -> bool: ...
def writable(self) -> bool: ...

class _BytesIOWrapper:
    """Wrapper exposing a text buffer (StringIO/TextIOBase) through a
    bytes read() interface."""

    buffer: StringIO | TextIOBase
    encoding: str
    # Presumably holds encoded bytes exceeding the last read() request —
    # confirm against pandas.io.common._BytesIOWrapper.
    overflow: bytes
    def __init__(self, buffer: StringIO | TextIOBase, encoding: str = ...) -> None: ...
    # Unknown attribute access falls through to the wrapped text buffer.
    def __getattr__(self, attr: str): ...
    def read(self, n: int | None = ...) -> bytes: ...

# Stub for pandas.io.common.file_exists: whether the given path/buffer
# refers to an existing file.
def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool: ...
12 changes: 8 additions & 4 deletions pandas-stubs/io/date_converters.pyi
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from __future__ import annotations

def parse_date_time(date_col, time_col): ...
def parse_date_fields(year_col, month_col, day_col): ...
import numpy as np

from pandas._typing import npt as npt

def parse_date_time(date_col, time_col) -> npt.NDArray[np.object_]: ...
def parse_date_fields(year_col, month_col, day_col) -> npt.NDArray[np.object_]: ...
def parse_all_fields(
year_col, month_col, day_col, hour_col, minute_col, second_col
): ...
def generic_parser(parse_func, *cols): ...
) -> npt.NDArray[np.object_]: ...
def generic_parser(parse_func, *cols) -> np.ndarray: ...
23 changes: 17 additions & 6 deletions pandas-stubs/io/feather_format.pyi
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
from __future__ import annotations

# from pandas import DataFrame, Int64Index, RangeIndex
from typing import Sequence

from pandas.core.frame import DataFrame

from pandas._typing import FilePathOrBuffer
from pandas._typing import (
FilePath,
HashableT,
ReadBuffer,
StorageOptions,
WriteBuffer,
)

def to_feather(df: DataFrame, path): ...
# Stub for pandas.io.feather_format.to_feather: write `df` to `path`
# in Feather format.
def to_feather(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes],
    storage_options: StorageOptions = ...,
    **kwargs,
) -> None: ...
# BUG FIX: the stub had no return annotation (implicitly Any);
# pandas.read_feather returns a DataFrame, which this stub module
# already imports above.
def read_feather(
    path: FilePath | ReadBuffer[bytes],
    columns: list[HashableT] | None = ...,
    use_threads: bool = ...,
    storage_options: StorageOptions = ...,
) -> DataFrame: ...
11 changes: 5 additions & 6 deletions pandas-stubs/io/gbq.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ def read_gbq(
dialect: str | None = ...,
location: str | None = ...,
configuration: dict[str, Any] | None = ...,
credentials=...,
# Credentials is a google type, use Any since unavailable
credentials: Any | None = ...,
use_bqstorage_api: bool | None = ...,
private_key=...,
verbose=...,
max_results: int | None = ...,
progress_bar_type: str | None = ...,
) -> DataFrame: ...
def to_gbq(
Expand All @@ -31,7 +31,6 @@ def to_gbq(
table_schema: list[dict[str, str]] | None = ...,
location: str | None = ...,
progress_bar: bool = ...,
credentials=...,
verbose=...,
private_key=...,
# Credentials is a google type, use Any since unavailable
credentials: Any | None = ...,
) -> None: ...
9 changes: 0 additions & 9 deletions pandas-stubs/io/gcs.pyi

This file was deleted.

Loading