Commit 1d30b31

Bring I/O support to Dask for everything supported (#955)
* Brings I/O support to Dask for everything already supported on Ray. Resolves #940.
* Refactors the code to be modular so that Dask can reuse the implementation originally intended for Ray. Builds on #754.
* Removes a large amount of duplicate logic in the Column Store family of readers.
* Removes unnecessary classes and instead creates anonymous classes that mix in the necessary components of the readers and call `.read` on the anonymous class (see the sketch below).
* An interesting performance issue came up with `HDFStore` and `read_hdf` (related to pandas-dev/pandas#12236):
  * With Ray, multiple workers can read hdf5 files, but doing so is about 4x slower than defaulting to pandas.
  * Dask cannot read the hdf5 files in parallel without seg-faulting.
* Dask and Ray now support the same I/O.
* Performance testing showed that Dask improves when `n_workers` is set to the number of CPUs on the machine; #954 was opened to track that tuning.
1 parent 898889c commit 1d30b31
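
The anonymous-class mix-in described above is not itself shown in this diff; the snippet below is only a rough sketch of how an engine might compose a pandas parser with one of the new base readers. The `EngineIOMixin` name, and the assumption that it supplies `deploy`, `materialize`, `frame_partition_cls`, `frame_cls`, and `query_compiler_cls`, are hypothetical.

from modin.backends.pandas.parsers import PandasParquetParser
from modin.engines.base.io import ParquetReader


def make_parquet_reader(EngineIOMixin):
    # type() builds an anonymous class; through the MRO it picks up
    # PandasParquetParser.parse, ParquetReader.read/build_query_compiler,
    # and the engine mix-in's deploy/materialize plus frame/query-compiler classes.
    return type("", (EngineIOMixin, PandasParquetParser, ParquetReader), {})


# Hypothetical usage inside an engine's io module:
#   reader_cls = make_parquet_reader(DaskIOMixin)
#   query_compiler = reader_cls.read(path, engine="pyarrow", columns=None)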

File tree: 15 files changed, +466 -618 lines


modin/backends/pandas/parsers.py (+56)
@@ -123,3 +123,59 @@ def parse(fname, **kwargs):
            pandas_df.dtypes,
            partition_columns,
        ]


class PandasParquetParser(PandasParser):
    @staticmethod
    def parse(fname, **kwargs):
        import pyarrow.parquet as pq

        num_splits = kwargs.pop("num_splits", None)
        columns = kwargs.get("columns", None)
        if num_splits is None:
            return pandas.read_parquet(fname, **kwargs)
        kwargs["use_pandas_metadata"] = True
        df = pq.read_table(fname, **kwargs).to_pandas()
        if columns is not None:
            df = df[columns]
        # Append the length of the index here to build it externally
        return _split_result_for_readers(0, num_splits, df) + [len(df.index), df.dtypes]


class PandasHDFParser(PandasParser):  # pragma: no cover
    @staticmethod
    def parse(fname, **kwargs):
        kwargs["key"] = kwargs.pop("_key", None)
        num_splits = kwargs.pop("num_splits", None)
        if num_splits is None:
            return pandas.read_hdf(fname, **kwargs)
        df = pandas.read_hdf(fname, **kwargs)
        # Append the length of the index here to build it externally
        return _split_result_for_readers(0, num_splits, df) + [len(df.index), df.dtypes]


class PandasFeatherParser(PandasParser):
    @staticmethod
    def parse(fname, **kwargs):
        from pyarrow import feather

        num_splits = kwargs.pop("num_splits", None)
        if num_splits is None:
            return pandas.read_feather(fname, **kwargs)
        df = feather.read_feather(fname, **kwargs)
        # Append the length of the index here to build it externally
        return _split_result_for_readers(0, num_splits, df) + [len(df.index), df.dtypes]


class PandasSQLParser(PandasParser):
    @staticmethod
    def parse(sql, con, index_col, **kwargs):
        num_splits = kwargs.pop("num_splits", None)
        if num_splits is None:
            return pandas.read_sql(sql, con, index_col=index_col, **kwargs)
        df = pandas.read_sql(sql, con, index_col=index_col, **kwargs)
        if index_col is None:
            index = len(df)
        else:
            index = df.index
        return _split_result_for_readers(1, num_splits, df) + [index, df.dtypes]
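
Each `parse` above returns the data splits from `_split_result_for_readers` plus two metadata objects (the index length or index, and the dtypes) so the reader can rebuild the frame's index and dtypes without collecting the data. The helper itself lives earlier in `parsers.py` and is not part of this hunk; the sketch below only illustrates the contract it is assumed to satisfy, namely returning exactly `num_splits` frames along the given axis (trailing ones may be empty).

def _split_result_for_readers_sketch(axis, num_splits, df):
    """Hypothetical stand-in for the real helper, not the actual implementation."""
    n = len(df) if axis == 0 else len(df.columns)
    chunk = max(1, -(-n // num_splits))  # ceiling division
    if axis == 0:
        return [df.iloc[i * chunk : (i + 1) * chunk] for i in range(num_splits)]
    return [df.iloc[:, i * chunk : (i + 1) * chunk] for i in range(num_splits)]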

modin/engines/base/io/__init__.py (+15, -1)
@@ -3,5 +3,19 @@
 from modin.engines.base.io.text.json_reader import JSONReader
 from modin.engines.base.io.file_reader import FileReader
 from modin.engines.base.io.text.text_file_reader import TextFileReader
+from modin.engines.base.io.column_stores.parquet_reader import ParquetReader
+from modin.engines.base.io.column_stores.hdf_reader import HDFReader
+from modin.engines.base.io.column_stores.feather_reader import FeatherReader
+from modin.engines.base.io.sql.sql_reader import SQLReader

-__all__ = ["BaseIO", "CSVReader", "JSONReader", "FileReader", "TextFileReader"]
+__all__ = [
+    "BaseIO",
+    "CSVReader",
+    "JSONReader",
+    "FileReader",
+    "TextFileReader",
+    "ParquetReader",
+    "HDFReader",
+    "FeatherReader",
+    "SQLReader",
+]
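
With these readers exported, the Dask and Ray engines go through the same base I/O layer, so the top-level entry points behave the same on either engine. A hypothetical smoke test, assuming the `MODIN_ENGINE` environment variable is how the installed Modin version selects its execution engine and that the listed files exist:

import os

os.environ["MODIN_ENGINE"] = "dask"  # or "ray"; assumed selection mechanism

import modin.pandas as pd

# Each call dispatches through the readers exported from modin.engines.base.io.
df_parquet = pd.read_parquet("data.parquet")
df_feather = pd.read_feather("data.feather")
df_hdf = pd.read_hdf("data.h5", key="df")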

modin/engines/base/io/column_stores/__init__.py

Whitespace-only changes.

modin/engines/base/io/column_stores/column_store_reader.py (new file)
@@ -0,0 +1,102 @@
import numpy as np
import pandas

from modin.data_management.utils import compute_chunksize
from modin.engines.base.io.file_reader import FileReader


class ColumnStoreReader(FileReader):
    @classmethod
    def call_deploy(cls, fname, col_partitions, **kwargs):
        from modin.pandas import DEFAULT_NPARTITIONS

        return np.array(
            [
                cls.deploy(
                    cls.parse,
                    DEFAULT_NPARTITIONS + 2,
                    dict(
                        fname=fname,
                        columns=cols,
                        num_splits=DEFAULT_NPARTITIONS,
                        **kwargs
                    ),
                )
                for cols in col_partitions
            ]
        ).T

    @classmethod
    def build_partition(cls, partition_ids, row_lengths, column_widths):
        return np.array(
            [
                [
                    cls.frame_partition_cls(
                        partition_ids[i][j],
                        length=row_lengths[i],
                        width=column_widths[j],
                    )
                    for j in range(len(partition_ids[i]))
                ]
                for i in range(len(partition_ids))
            ]
        )

    @classmethod
    def build_index(cls, partition_ids):
        from modin.pandas import DEFAULT_NPARTITIONS

        index_len = cls.materialize(partition_ids[-2][0])
        index = pandas.RangeIndex(index_len)
        index_chunksize = compute_chunksize(
            pandas.DataFrame(index=index), DEFAULT_NPARTITIONS, axis=0
        )
        if index_chunksize > index_len:
            row_lengths = [index_len] + [0 for _ in range(DEFAULT_NPARTITIONS - 1)]
        else:
            row_lengths = [
                index_chunksize
                if i != DEFAULT_NPARTITIONS - 1
                else index_len - (index_chunksize * (DEFAULT_NPARTITIONS - 1))
                for i in range(DEFAULT_NPARTITIONS)
            ]
        return index, row_lengths

    @classmethod
    def build_columns(cls, columns):
        from modin.pandas import DEFAULT_NPARTITIONS

        column_splits = (
            len(columns) // DEFAULT_NPARTITIONS
            if len(columns) % DEFAULT_NPARTITIONS == 0
            else len(columns) // DEFAULT_NPARTITIONS + 1
        )
        col_partitions = [
            columns[i : i + column_splits]
            for i in range(0, len(columns), column_splits)
        ]
        column_widths = [len(c) for c in col_partitions]
        return col_partitions, column_widths

    @classmethod
    def build_dtypes(cls, partition_ids, columns):
        # Compute dtypes by concatenating the results from each of the column splits
        # determined above. This creates a pandas Series that contains a dtype for
        # every column.
        dtypes = pandas.concat(cls.materialize(list(partition_ids)), axis=0)
        dtypes.index = columns
        return dtypes

    @classmethod
    def build_query_compiler(cls, path, columns, **kwargs):
        col_partitions, column_widths = cls.build_columns(columns)
        partition_ids = cls.call_deploy(path, col_partitions, **kwargs)
        index, row_lens = cls.build_index(partition_ids)
        remote_parts = cls.build_partition(partition_ids[:-2], row_lens, column_widths)
        dtypes = cls.build_dtypes(partition_ids[-1], columns)
        new_query_compiler = cls.query_compiler_cls(
            cls.frame_cls(
                remote_parts, index, columns, row_lens, column_widths, dtypes=dtypes,
            )
        )
        return new_query_compiler
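
`call_deploy` launches one task per column split and asks each task for `DEFAULT_NPARTITIONS + 2` return objects: the row-wise data splits, then the index length, then the dtypes, which is exactly what the parsers above return. Transposing puts the metadata in the last two rows, which is why `build_partition` takes `partition_ids[:-2]`, `build_index` reads `partition_ids[-2][0]`, and `build_dtypes` concatenates `partition_ids[-1]`. A toy illustration of that layout, using placeholder strings instead of real object references (the `DEFAULT_NPARTITIONS` value is assumed):

import numpy as np

DEFAULT_NPARTITIONS = 4                # assumed value for the example
col_partitions = [["a", "b"], ["c"]]   # two column splits


def fake_deploy(cols):
    # Stand-in for cls.deploy(cls.parse, DEFAULT_NPARTITIONS + 2, ...):
    # row-wise data splits followed by the index length and the dtypes.
    blocks = ["block({}, {})".format(i, cols) for i in range(DEFAULT_NPARTITIONS)]
    return blocks + ["index_len", "dtypes"]


partition_ids = np.array([fake_deploy(c) for c in col_partitions]).T
print(partition_ids.shape)      # (6, 2): one row per split plus 2 metadata rows
print(partition_ids[-2][0])     # 'index_len' -> consumed by build_index()
print(list(partition_ids[-1]))  # dtypes row  -> consumed by build_dtypes()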

modin/engines/base/io/column_stores/feather_reader.py (new file)
@@ -0,0 +1,26 @@
from modin.engines.base.io.column_stores.column_store_reader import ColumnStoreReader


class FeatherReader(ColumnStoreReader):
    @classmethod
    def read(cls, path, columns=None, **kwargs):
        """Read a pandas.DataFrame from Feather format.
        Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the feather file.
                We only support local files for now.
                multi threading is set to True by default
            columns: not supported by pandas api, but can be passed here to read only
                specific columns

        Notes:
            pyarrow feather is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/api.html#feather-format
        """
        if columns is None:
            from pyarrow.feather import FeatherReader

            fr = FeatherReader(path)
            columns = [fr.get_column_name(i) for i in range(fr.num_columns)]
        return cls.build_query_compiler(path, columns, use_threads=False)

modin/engines/base/io/column_stores/hdf_reader.py (new file)
@@ -0,0 +1,50 @@
import pandas

from modin.engines.base.io.column_stores.column_store_reader import ColumnStoreReader
from modin.error_message import ErrorMessage


class HDFReader(ColumnStoreReader):  # pragma: no cover
    @classmethod
    def _validate_hdf_format(cls, path_or_buf):
        s = pandas.HDFStore(path_or_buf)
        groups = s.groups()
        if len(groups) == 0:
            raise ValueError("No dataset in HDF5 file.")
        candidate_only_group = groups[0]
        format = getattr(candidate_only_group._v_attrs, "table_type", None)
        s.close()
        return format

    @classmethod
    def read(cls, path_or_buf, **kwargs):
        """Load an h5 file from the file path or buffer, returning a DataFrame.

        Args:
            path: string, buffer or path object
                Path to the file to open, or an open :class:`pandas.HDFStore` object.
            kwargs: Pass into pandas.read_hdf function.

        Returns:
            DataFrame constructed from the h5 file.
        """
        if cls._validate_hdf_format(path_or_buf=path_or_buf) is None:
            ErrorMessage.default_to_pandas(
                "File format seems to be `fixed`. For better distribution consider "
                "saving the file in `table` format. df.to_hdf(format=`table`)."
            )
            return cls.single_worker_read(path_or_buf, **kwargs)

        columns = kwargs.pop("columns", None)
        # Have to do this because of Dask's keyword arguments
        kwargs["_key"] = kwargs.pop("key", None)
        if not columns:
            start = kwargs.pop("start", None)
            stop = kwargs.pop("stop", None)
            empty_pd_df = pandas.read_hdf(path_or_buf, start=0, stop=0, **kwargs)
            if start is not None:
                kwargs["start"] = start
            if stop is not None:
                kwargs["stop"] = stop
            columns = empty_pd_df.columns
        return cls.build_query_compiler(path_or_buf, columns, **kwargs)
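
As the message above indicates, only HDF5 data stored in `table` format is distributed; `fixed`-format files (pandas' default) fall back to a single-worker read. A small hypothetical example of producing a file that takes the distributed path (requires PyTables; the file name is illustrative):

import pandas

df = pandas.DataFrame({"a": range(10), "b": range(10)})

# format="table" stores a PyTables Table, which _validate_hdf_format detects
# through the group's `table_type` attribute; the default format="fixed"
# would trigger the default-to-pandas warning instead.
df.to_hdf("example.h5", key="df", format="table", mode="w")

import modin.pandas as pd

distributed_df = pd.read_hdf("example.h5", key="df")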

modin/engines/base/io/column_stores/parquet_reader.py (new file)
@@ -0,0 +1,62 @@
import os

from modin.engines.base.io.column_stores.column_store_reader import ColumnStoreReader
from modin.error_message import ErrorMessage


class ParquetReader(ColumnStoreReader):
    @classmethod
    def read(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
        Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                We only support local files for now.
            engine: Ray only supports the pyarrow reader.
                This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            original_path = path
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on.
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    path = os.path.join(root, files[0])
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas("Partitioned Columns in Parquet")
                return cls.single_worker_read(
                    original_path, engine=engine, columns=columns, **kwargs
                )
        else:
            directory = False

        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining columns
                pd = ParquetDataset(path)
                column_names = pd.schema.names
            else:
                pf = ParquetFile(path)
                column_names = pf.metadata.schema.names
            columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
        return cls.build_query_compiler(path, columns, **kwargs)
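
The directory walk above treats every `name=value` directory level as a partition column and, if any are found, defaults the whole read to pandas. A hypothetical layout that hits that path, written with pandas (assumes a pandas/pyarrow combination that supports `partition_cols`; the file names are illustrative):

import pandas

df = pandas.DataFrame({"year": [2018, 2018, 2019], "value": [1.0, 2.0, 3.0]})

# Produces a hive-style layout such as:
#   sales.parquet/year=2018/<part>.parquet
#   sales.parquet/year=2019/<part>.parquet
# os.walk() then yields dir_names like ["year=2018", ...], so "year" is added
# to partitioned_columns and ParquetReader.read uses single_worker_read.
df.to_parquet("sales.parquet", engine="pyarrow", partition_cols=["year"])

import modin.pandas as pd

sales = pd.read_parquet("sales.parquet")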

modin/engines/base/io/sql/__init__.py

Whitespace-only changes.
