Allow specifying the delimiter for pandas' CSV loader.

xuhdev · xuhdev · commit dcf157d27428 · 2020-12-14T19:51:52.000-08:00
Close #48
diff --git a/pydax/loaders/_table.py b/pydax/loaders/_table.py
@@ -36,6 +36,7 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic
         :param options:
                - ``columns`` key specifies the data type of each column. Each data type corresponds to a Pandas'
                  supported dtype. If unspecified, then it is default.
+               - ``delimiter`` key specifies the delimiter of the input CSV file. Defaults to ``,``.
                - ``encoding`` key specifies the encoding of the CSV file. Defaults to UTF-8.
         :raises TypeError: ``path`` is not a path object.
         """
@@ -54,4 +55,6 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic
             else:
                 dtypes[column] = type_
 
-        return pd.read_csv(path, parse_dates=parse_dates, dtype=dtypes, encoding=options.get('encoding', 'utf-8'))
+        return pd.read_csv(path, parse_dates=parse_dates, dtype=dtypes,
+                           encoding=options.get('encoding', 'utf-8'),
+                           delimiter=options.get('delimiter', ','))
diff --git a/tests/schemata/datasets.yaml b/tests/schemata/datasets.yaml
@@ -77,6 +77,7 @@ datasets:
             id: csv
             options:
               encoding: 'UTF-8'
+              delimiter: ','
               columns:
                 DATE: 'datetime'
                 # Would have been int in pandas if unspecified. Put this here to have some dtype processing code ran more frequently in test code
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
@@ -200,6 +200,26 @@ def test_csv_pandas_column_unsupported_data_types(self, tmp_path, noaa_jfk_schem
         for t in (err_column.dtype, err_column.check):
             assert re.search(rf"{t}(\d*|ing)\b", str(e.value))  # "ing" is for "str'ing'"
 
+    def test_csv_pandas_no_delimiter(self, tmp_path, noaa_jfk_schema):
+        "Test when no delimiter is given."
+        # Remove the delimiter option
+        del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['delimiter']
+        data = Dataset(noaa_jfk_schema, tmp_path,
+                       mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD).data['jfk_weather_cleaned']
+        assert len(data.columns) == 16  # Number of columns remains the same
+
+    @pytest.mark.parametrize('delimiter', ('\t', ' ', '|', ';'))
+    def test_csv_pandas_delimiter(self, tmp_path, noaa_jfk_schema, delimiter):
+        "Test common delimiter settings. Note that the case of comma has been tested in ``test_csv_pandas_loader``."
+
+        del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['columns']
+        # Change the delimiter to one of: tab, space, |, ;
+        noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['delimiter'] = delimiter
+        data = Dataset(noaa_jfk_schema, tmp_path,
+                       mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD).data['jfk_weather_cleaned']
+        # None of these delimiters exist in the file, so the number of columns should be 1
+        assert len(data.columns) == 1
+
     def test_csv_pandas_loader_no_path(self):
         "Test CSVPandasLoader when fed in with non-path."