Skip to content

Commit dcf157d

Browse files
committed
Allow specifying the delimiter for pandas' CSV loader.
Close #48
1 parent 1e4b44c commit dcf157d

File tree

3 files changed

+25
-1
lines changed

3 files changed

+25
-1
lines changed

pydax/loaders/_table.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic
3636
:param options:
3737
- ``columns`` key specifies the data type of each column. Each data type corresponds to a Pandas'
3838
supported dtype. If a column is unspecified, pandas' default dtype for it is used.
39+
- ``delimiter`` key specifies the delimiter of the input CSV file. Defaults to ``,``.
3940
- ``encoding`` key specifies the encoding of the CSV file. Defaults to UTF-8.
4041
:raises TypeError: ``path`` is not a path object.
4142
"""
@@ -54,4 +55,6 @@ def load(self, path: Union[_typing.PathLike, Dict[str, str]], options: SchemaDic
5455
else:
5556
dtypes[column] = type_
5657

57-
return pd.read_csv(path, parse_dates=parse_dates, dtype=dtypes, encoding=options.get('encoding', 'utf-8'))
58+
return pd.read_csv(path, parse_dates=parse_dates, dtype=dtypes,
59+
encoding=options.get('encoding', 'utf-8'),
60+
delimiter=options.get('delimiter', ','))

tests/schemata/datasets.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ datasets:
7777
id: csv
7878
options:
7979
encoding: 'UTF-8'
80+
delimiter: ','
8081
columns:
8182
DATE: 'datetime'
8283
# Would have been int in pandas if unspecified. Put this here to have some dtype processing code run more frequently in test code

tests/test_loaders.py

+20
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,26 @@ def test_csv_pandas_column_unsupported_data_types(self, tmp_path, noaa_jfk_schem
200200
for t in (err_column.dtype, err_column.check):
201201
assert re.search(rf"{t}(\d*|ing)\b", str(e.value)) # "ing" is for "str'ing'"
202202

203+
def test_csv_pandas_no_delimiter(self, tmp_path, noaa_jfk_schema):
204+
"Test when no delimiter is given."
205+
# Remove the delimiter option
206+
del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['delimiter']
207+
data = Dataset(noaa_jfk_schema, tmp_path,
208+
mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD).data['jfk_weather_cleaned']
209+
assert len(data.columns) == 16 # Number of columns remains the same
210+
211+
@pytest.mark.parametrize('delimiter', ('\t', ' ', '|', ';'))
212+
def test_csv_pandas_delimiter(self, tmp_path, noaa_jfk_schema, delimiter):
213+
"Test common delimiter settings. Note that the case of comma has been tested in ``test_csv_pandas_loader``."
214+
215+
del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['columns']
216+
# Change delimiter to tab, |, ;, space
217+
noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['delimiter'] = delimiter
218+
data = Dataset(noaa_jfk_schema, tmp_path,
219+
mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD).data['jfk_weather_cleaned']
220+
# None of these delimiters exist in the file, so the number of columns should be 1
221+
assert len(data.columns) == 1
222+
203223
def test_csv_pandas_loader_no_path(self):
204224
"Test CSVPandasLoader when fed in with non-path."
205225

0 commit comments

Comments
 (0)