From 1eee38b996df4bca3982bb3626715a418e427a79 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Thu, 9 May 2024 22:09:20 +0300 Subject: [PATCH 1/9] DOC: Add examples for pd.read_csv --- pandas/io/parsers/readers.py | 50 +++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 648e5108df77a..039e0651ea5c7 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -337,6 +337,8 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): - "mixed", to infer the format for each element individually. This is risky, and you should probably use it along with `dayfirst`. + You can specify the format for each column by passing a dict, + e.g. {"A": "%d/%m/%Y", "B": "%d/%m/%Y %H:%M"} .. versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. @@ -485,7 +487,53 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Examples -------- ->>> pd.{func_name}('data.csv') # doctest: +SKIP +>>> pd.read_csv('data.csv') # doctest: +SKIP + Name Value +0 foo 1 +1 bar 2 +2 #baz 3 + +Index and header can be specified via the `index_col` and `header` arguments + +>>> pd.read_csv('data.csv', header=None) # doctest: +SKIP + 0 1 +0 Name Value +1 foo 1 +2 bar 2 +3 #baz 3 + +>>> pd.read_csv('data.csv', index_col='Value') # doctest: +SKIP + Name +Value +1 foo +2 bar +3 #baz + +Column types are inferred but can be explicitly specified using the dtype argument + +>>> pd.read_csv('data.csv', dtype={{'Value': float}}) # doctest: +SKIP + Name Value +0 foo 1.0 +1 bar 2.0 +2 #baz 3.0 + +True, False, and NA values, and thousands separators have defaults, +but can be explicitly specified, too. Supply the values you would like +as strings or lists of strings! + +>>> pd.read_csv('data.csv', na_values=['foo', 'bar']) # doctest: +SKIP + Name Value +0 NaN 1 +1 NaN 2 +2 #baz 3 + +Comment lines in the CSV input file can be skipped using the +`comment` argument. + +>>> pd.read_csv('data.csv', comment='#') # doctest: +SKIP + Name Value +0 foo 1 +1 bar 2 """ ) From eb75d32d8149a7704948f564ecc24dd9ba37a71f Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Thu, 9 May 2024 22:28:22 +0300 Subject: [PATCH 2/9] Add double braces --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 039e0651ea5c7..31efaf6bb1338 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -338,7 +338,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): and you should probably use it along with `dayfirst`. You can specify the format for each column by passing a dict, - e.g. {"A": "%d/%m/%Y", "B": "%d/%m/%Y %H:%M"} + e.g. {{"A": "%d/%m/%Y", "B": "%d/%m/%Y %H:%M"}} .. versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. From 189c5f4f60363003223a6f0c3c081d3446c62230 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Thu, 9 May 2024 22:58:01 +0300 Subject: [PATCH 3/9] fixes --- pandas/io/parsers/readers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 31efaf6bb1338..5773b2ee56d1f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -337,8 +337,8 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): - "mixed", to infer the format for each element individually. This is risky, and you should probably use it along with `dayfirst`. - You can specify the format for each column by passing a dict, - e.g. {{"A": "%d/%m/%Y", "B": "%d/%m/%Y %H:%M"}} + You can specify the format for each column by passing a dict + (e.g. ``{{"A": "%d/%m/%Y", "B": "%d/%m/%Y %H:%M"}}``). .. versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. @@ -487,7 +487,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Examples -------- ->>> pd.read_csv('data.csv') # doctest: +SKIP +>>> pd.{func_name}('data.csv') # doctest: +SKIP Name Value 0 foo 1 1 bar 2 @@ -495,14 +495,14 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Index and header can be specified via the `index_col` and `header` arguments ->>> pd.read_csv('data.csv', header=None) # doctest: +SKIP +>>> pd.{func_name}('data.csv', header=None) # doctest: +SKIP 0 1 0 Name Value 1 foo 1 2 bar 2 3 #baz 3 ->>> pd.read_csv('data.csv', index_col='Value') # doctest: +SKIP +>>> pd.{func_name}('data.csv', index_col='Value') # doctest: +SKIP Name Value 1 foo @@ -511,7 +511,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Column types are inferred but can be explicitly specified using the dtype argument ->>> pd.read_csv('data.csv', dtype={{'Value': float}}) # doctest: +SKIP +>>> pd.{func_name}('data.csv', dtype={{'Value': float}}) # doctest: +SKIP Name Value 0 foo 1.0 1 bar 2.0 @@ -521,7 +521,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): but can be explicitly specified, too. Supply the values you would like as strings or lists of strings! ->>> pd.read_csv('data.csv', na_values=['foo', 'bar']) # doctest: +SKIP +>>> pd.{func_name}('data.csv', na_values=['foo', 'bar']) # doctest: +SKIP Name Value 0 NaN 1 1 NaN 2 @@ -530,7 +530,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Comment lines in the CSV input file can be skipped using the `comment` argument. ->>> pd.read_csv('data.csv', comment='#') # doctest: +SKIP +>>> pd.{func_name}('data.csv', comment='#') # doctest: +SKIP Name Value 0 foo 1 1 bar 2 From 313a90819220dc489e993b5b701b50a5a47e1a65 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Fri, 10 May 2024 00:28:57 +0300 Subject: [PATCH 4/9] Add example for date_format --- pandas/io/parsers/readers.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 5773b2ee56d1f..13056059580c1 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -337,8 +337,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): - "mixed", to infer the format for each element individually. This is risky, and you should probably use it along with `dayfirst`. - You can specify the format for each column by passing a dict - (e.g. ``{{"A": "%d/%m/%Y", "B": "%d/%m/%Y %H:%M"}}``). .. versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. @@ -527,13 +525,40 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): 1 NaN 2 2 #baz 3 -Comment lines in the CSV input file can be skipped using the -`comment` argument. +Comment lines in the input file can be skipped using the `comment` argument. >>> pd.{func_name}('data.csv', comment='#') # doctest: +SKIP Name Value 0 foo 1 1 bar 2 + +>>> df = pd.{func_name}('tmp.csv') # doctest: +SKIP + +>>> df # doctest: +SKIP + col 1 col 2 col 3 +0 10 10/04/2018 Sun 15 Jan 2023 +1 20 15/04/2018 Fri 12 May 2023 + +>>> df.dtypes # doctest: +SKIP +col 1 int64 +col 2 object +col 3 object +dtype: object + +Specific columns can be parsed as dates by using the `parse_dates` and +`date_format` arguments. + +>>> df = pd.read_csv( +... "tmp.csv", +... parse_dates=[1, 2], +... date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}, +...) # doctest: +SKIP + +>>> df.dtypes # doctest: +SKIP +col 1 int64 +col 2 datetime64[ns] +col 3 datetime64[ns] +dtype: object """ ) From bd0488ef4f7e3028453a76e4629a3dfe60e9bee4 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Fri, 10 May 2024 00:30:39 +0300 Subject: [PATCH 5/9] Consistent use of single quotes --- pandas/io/parsers/readers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 13056059580c1..837028a3b0d63 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -548,10 +548,10 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Specific columns can be parsed as dates by using the `parse_dates` and `date_format` arguments. ->>> df = pd.read_csv( -... "tmp.csv", +>>> df = pd.{func_name}( +... 'tmp.csv', ... parse_dates=[1, 2], -... date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}, +... date_format={"col 2": '%d/%m/%Y', 'col 3': '%a %d %b %Y'}, ...) # doctest: +SKIP >>> df.dtypes # doctest: +SKIP From 41766cd70342fb1b902ec3b2bb16dfda7d6ce3eb Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Fri, 10 May 2024 00:31:25 +0300 Subject: [PATCH 6/9] I forgot --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 837028a3b0d63..ef4c42973943a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -551,7 +551,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): >>> df = pd.{func_name}( ... 'tmp.csv', ... parse_dates=[1, 2], -... date_format={"col 2": '%d/%m/%Y', 'col 3': '%a %d %b %Y'}, +... date_format={'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}, ...) # doctest: +SKIP >>> df.dtypes # doctest: +SKIP From eb3c798c1b427acb94ea41392f6853ff4b377f34 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Fri, 10 May 2024 00:40:07 +0300 Subject: [PATCH 7/9] Add double braces, again.. --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ef4c42973943a..9eb556559f6d8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -551,7 +551,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): >>> df = pd.{func_name}( ... 'tmp.csv', ... parse_dates=[1, 2], -... date_format={'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}, +... date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}}, ...) # doctest: +SKIP >>> df.dtypes # doctest: +SKIP From 8f8b5b91921054d4bd5314a7fbad0911ffc32271 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Fri, 10 May 2024 09:51:24 +0300 Subject: [PATCH 8/9] Space --- pandas/io/parsers/readers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 9eb556559f6d8..b7d59c510cd46 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -549,10 +549,10 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): `date_format` arguments. >>> df = pd.{func_name}( -... 'tmp.csv', -... parse_dates=[1, 2], -... date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}}, -...) # doctest: +SKIP +... 'tmp.csv', +... parse_dates=[1, 2], +... date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}}, +... ) # doctest: +SKIP >>> df.dtypes # doctest: +SKIP col 1 int64 From c52f57c5d9e3dbd9f5b24ab4b16a020dd171b599 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Fri, 10 May 2024 10:30:20 +0300 Subject: [PATCH 9/9] Add useful text --- pandas/io/parsers/readers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index b7d59c510cd46..24598701f1b67 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -491,7 +491,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): 1 bar 2 2 #baz 3 -Index and header can be specified via the `index_col` and `header` arguments +Index and header can be specified via the `index_col` and `header` arguments. >>> pd.{func_name}('data.csv', header=None) # doctest: +SKIP 0 1 @@ -507,7 +507,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): 2 bar 3 #baz -Column types are inferred but can be explicitly specified using the dtype argument +Column types are inferred but can be explicitly specified using the dtype argument. >>> pd.{func_name}('data.csv', dtype={{'Value': float}}) # doctest: +SKIP Name Value @@ -532,6 +532,8 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): 0 foo 1 1 bar 2 +By default, columns with dates will be read as ``object`` rather than ``datetime``. + >>> df = pd.{func_name}('tmp.csv') # doctest: +SKIP >>> df # doctest: +SKIP