diff --git a/unit-1-reading-data-with-python-and-pandas/lesson-1-reading-csv-and-txt-files/files/Lecture.ipynb b/unit-1-reading-data-with-python-and-pandas/lesson-1-reading-csv-and-txt-files/files/Lecture.ipynb index 44de965..28a7870 100644 --- a/unit-1-reading-data-with-python-and-pandas/lesson-1-reading-csv-and-txt-files/files/Lecture.ipynb +++ b/unit-1-reading-data-with-python-and-pandas/lesson-1-reading-csv-and-txt-files/files/Lecture.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -54,14 +54,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "<_io.TextIOWrapper name='btc-market-price.csv' mode='r' encoding='UTF-8'>\n" + "<_io.TextIOWrapper name='btc-market-price.csv' mode='r' encoding='cp1252'>\n" ] } ], @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -201,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 77, "metadata": {}, "outputs": [ { @@ -280,7 +280,7 @@ "4 Arab World ARB 1972 4.331606e+10" ] }, - "execution_count": 4, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -300,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 80, "metadata": { "scrolled": true }, @@ -369,7 +369,7 @@ "4 7/4/17 0:00 -" ] }, - "execution_count": 5, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -402,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -412,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 86, "metadata": {}, "outputs": [ { @@ -479,7 +479,7 @@ "4 6/4/17 0:00 -" ] }, - "execution_count": 7, + "execution_count": 86, "metadata": {}, 
"output_type": "execute_result" } @@ -501,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -512,7 +512,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 91, "metadata": {}, "outputs": [ { @@ -579,7 +579,7 @@ "4 6/4/17 0:00 NaN" ] }, - "execution_count": 9, + "execution_count": 91, "metadata": {}, "output_type": "execute_result" } @@ -601,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -613,7 +613,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 96, "metadata": { "scrolled": true }, @@ -682,7 +682,7 @@ "4 6/4/17 0:00 NaN" ] }, - "execution_count": 11, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -707,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -720,7 +720,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 101, "metadata": {}, "outputs": [ { @@ -787,7 +787,7 @@ "4 6/4/17 0:00 NaN" ] }, - "execution_count": 13, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } @@ -798,7 +798,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 103, "metadata": {}, "outputs": [ { @@ -809,7 +809,7 @@ "dtype: object" ] }, - "execution_count": 14, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -829,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 132, "metadata": {}, "outputs": [ { @@ -843,27 +843,27 @@ "Name: Timestamp, dtype: datetime64[ns]" ] }, - "execution_count": 15, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pd.to_datetime(df['Timestamp']).head()" + "pd.to_datetime(df['Timestamp'], format='%d/%m/%Y').head()" ] }, { "cell_type": "code", - 
"execution_count": 16, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ - "df['Timestamp'] = pd.to_datetime(df['Timestamp'])" + "df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y')" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 143, "metadata": {}, "outputs": [ { @@ -930,7 +930,7 @@ "4 2017-06-04 NaN" ] }, - "execution_count": 17, + "execution_count": 143, "metadata": {}, "output_type": "execute_result" } @@ -941,7 +941,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 145, "metadata": {}, "outputs": [ { @@ -952,7 +952,7 @@ "dtype: object" ] }, - "execution_count": 18, + "execution_count": 145, "metadata": {}, "output_type": "execute_result" } @@ -974,23 +974,24 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "scrolled": false - }, + "execution_count": 166, + "metadata": {}, "outputs": [], "source": [ + "date_parser = lambda x: pd.to_datetime(x, format='%d/%m/%Y')\n", "df = pd.read_csv('btc-market-price.csv',\n", " header=None,\n", " na_values=['', '?', '-'],\n", " names=['Timestamp', 'Price'],\n", " dtype={'Price': 'float'},\n", - " parse_dates=[0])" + " parse_dates=[0],\n", + " date_format = date_parser\n", + ")" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 168, "metadata": {}, "outputs": [ { @@ -1021,27 +1022,27 @@ " \n", " \n", " 0\n", - " 2017-02-04\n", + " 2/4/17 0:00\n", " 1099.169125\n", " \n", " \n", " 1\n", - " 2017-03-04\n", + " 3/4/17 0:00\n", " 1141.813000\n", " \n", " \n", " 2\n", - " 2017-04-04\n", + " 4/4/17 0:00\n", " NaN\n", " \n", " \n", " 3\n", - " 2017-05-04\n", + " 5/4/17 0:00\n", " 1133.079314\n", " \n", " \n", " 4\n", - " 2017-06-04\n", + " 6/4/17 0:00\n", " NaN\n", " \n", " \n", @@ -1049,15 +1050,15 @@ "" ], "text/plain": [ - " Timestamp Price\n", - "0 2017-02-04 1099.169125\n", - "1 2017-03-04 1141.813000\n", - "2 2017-04-04 NaN\n", - "3 2017-05-04 1133.079314\n", - "4 2017-06-04 NaN" + " 
Timestamp Price\n", + "0 2/4/17 0:00 1099.169125\n", + "1 3/4/17 0:00 1141.813000\n", + "2 4/4/17 0:00 NaN\n", + "3 5/4/17 0:00 1133.079314\n", + "4 6/4/17 0:00 NaN" ] }, - "execution_count": 20, + "execution_count": 168, "metadata": {}, "output_type": "execute_result" } @@ -1068,18 +1069,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 170, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Timestamp datetime64[ns]\n", - "Price float64\n", + "Timestamp object\n", + "Price float64\n", "dtype: object" ] }, - "execution_count": 21, + "execution_count": 170, "metadata": {}, "output_type": "execute_result" } @@ -1103,22 +1104,24 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 181, "metadata": {}, "outputs": [], "source": [ + "date_parser = lambda x: pd.to_datetime(x, '%Y-%m-%d')\n", "df = pd.read_csv('btc-market-price.csv',\n", " header=None,\n", " na_values=['', '?', '-'],\n", " names=['Timestamp', 'Price'],\n", " dtype={'Price': 'float'},\n", " parse_dates=[0],\n", - " index_col=[0])" + " index_col=[0],\n", + " date_format = date_parser)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 183, "metadata": {}, "outputs": [ { @@ -1151,23 +1154,23 @@ " \n", " \n", " \n", - " 2017-02-04\n", + " 2/4/17 0:00\n", " 1099.169125\n", " \n", " \n", - " 2017-03-04\n", + " 3/4/17 0:00\n", " 1141.813000\n", " \n", " \n", - " 2017-04-04\n", + " 4/4/17 0:00\n", " NaN\n", " \n", " \n", - " 2017-05-04\n", + " 5/4/17 0:00\n", " 1133.079314\n", " \n", " \n", - " 2017-06-04\n", + " 6/4/17 0:00\n", " NaN\n", " \n", " \n", @@ -1175,16 +1178,16 @@ "" ], "text/plain": [ - " Price\n", - "Timestamp \n", - "2017-02-04 1099.169125\n", - "2017-03-04 1141.813000\n", - "2017-04-04 NaN\n", - "2017-05-04 1133.079314\n", - "2017-06-04 NaN" + " Price\n", + "Timestamp \n", + "2/4/17 0:00 1099.169125\n", + "3/4/17 0:00 1141.813000\n", + "4/4/17 0:00 NaN\n", + "5/4/17 0:00 1133.079314\n", + "6/4/17 0:00 NaN" ] }, - 
"execution_count": 5, + "execution_count": 183, "metadata": {}, "output_type": "execute_result" } @@ -1195,7 +1198,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 185, "metadata": {}, "outputs": [ { @@ -1205,7 +1208,7 @@ "dtype: object" ] }, - "execution_count": 24, + "execution_count": 185, "metadata": {}, "output_type": "execute_result" } @@ -1236,7 +1239,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 188, "metadata": {}, "outputs": [], "source": [ @@ -1245,7 +1248,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 190, "metadata": { "scrolled": true }, @@ -1314,7 +1317,7 @@ "Amy>Grimes>23>91>81 NaN NaN" ] }, - "execution_count": 7, + "execution_count": 190, "metadata": {}, "output_type": "execute_result" } @@ -1340,7 +1343,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 193, "metadata": {}, "outputs": [], "source": [ @@ -1350,7 +1353,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 195, "metadata": { "scrolled": true }, @@ -1437,7 +1440,7 @@ "4 Amy Grimes 23 91 81" ] }, - "execution_count": 9, + "execution_count": 195, "metadata": {}, "output_type": "execute_result" } @@ -1477,7 +1480,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 199, "metadata": {}, "outputs": [], "source": [ @@ -1487,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 201, "metadata": {}, "outputs": [ { @@ -1572,7 +1575,7 @@ "4 Amy Grimes 23 91 81" ] }, - "execution_count": 13, + "execution_count": 201, "metadata": {}, "output_type": "execute_result" } @@ -1583,7 +1586,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 203, "metadata": {}, "outputs": [ { @@ -1594,7 +1597,7 @@ "dtype: object" ] }, - "execution_count": 14, + "execution_count": 203, "metadata": {}, "output_type": "execute_result" } @@ -1612,7 +1615,7 @@ }, { "cell_type": "code", - "execution_count": 15, + 
"execution_count": 206, "metadata": {}, "outputs": [], "source": [ @@ -1623,7 +1626,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 208, "metadata": {}, "outputs": [ { @@ -1708,7 +1711,7 @@ "4 Amy Grimes 23 91.0 81.0" ] }, - "execution_count": 16, + "execution_count": 208, "metadata": {}, "output_type": "execute_result" } @@ -1719,7 +1722,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 210, "metadata": {}, "outputs": [ { @@ -1730,7 +1733,7 @@ "dtype: object" ] }, - "execution_count": 34, + "execution_count": 210, "metadata": {}, "output_type": "execute_result" } @@ -1748,7 +1751,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 213, "metadata": {}, "outputs": [ { @@ -1833,7 +1836,7 @@ "4 Amy Grimes 23 91 81" ] }, - "execution_count": 35, + "execution_count": 213, "metadata": {}, "output_type": "execute_result" } @@ -1860,7 +1863,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 216, "metadata": {}, "outputs": [], "source": [ @@ -1871,7 +1874,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 218, "metadata": {}, "outputs": [ { @@ -1956,7 +1959,7 @@ "4 Amy Grimes 23 91.0 81.0" ] }, - "execution_count": 18, + "execution_count": 218, "metadata": {}, "output_type": "execute_result" } @@ -1974,7 +1977,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 221, "metadata": {}, "outputs": [ { @@ -2041,7 +2044,7 @@ "2 Amy Grimes 23 91 81" ] }, - "execution_count": 19, + "execution_count": 221, "metadata": {}, "output_type": "execute_result" } @@ -2061,7 +2064,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 224, "metadata": {}, "outputs": [], "source": [ @@ -2073,7 +2076,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 226, "metadata": {}, "outputs": [ { @@ -2140,7 +2143,7 @@ "2 Amy Grimes 23 91.0 81" ] }, - "execution_count": 21, + "execution_count": 226, 
"metadata": {}, "output_type": "execute_result" } @@ -2164,7 +2167,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 229, "metadata": {}, "outputs": [ { @@ -2258,7 +2261,7 @@ "5 Amy Grimes 23.0 91 81" ] }, - "execution_count": 22, + "execution_count": 229, "metadata": {}, "output_type": "execute_result" } @@ -2286,7 +2289,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 232, "metadata": {}, "outputs": [ { @@ -2359,7 +2362,7 @@ "4 Amy Grimes 23" ] }, - "execution_count": 42, + "execution_count": 232, "metadata": {}, "output_type": "execute_result" } @@ -2379,7 +2382,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 235, "metadata": {}, "outputs": [ { @@ -2452,7 +2455,7 @@ "4 Amy Grimes 23" ] }, - "execution_count": 43, + "execution_count": 235, "metadata": {}, "output_type": "execute_result" } @@ -2471,12 +2474,12 @@ "\n", "## Using a `Series` instead of `DataFrame`\n", "\n", - "If the parsed data only contains one column then we can return a Series by setting the `squeeze` parameter to `True`." + "If the parsed data only contains one column, we can get a `Series` instead of a `DataFrame` by calling `.squeeze()` on the result of `read_csv`; this replaces the `squeeze=True` parameter, which was deprecated in pandas 1.4 and removed in pandas 2.0." ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 383, "metadata": {}, "outputs": [], "source": [ @@ -2487,7 +2490,79 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 385, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_name
0Morley
1Scott
2Haley
3Mills
4Grimes
\n", + "
" + ], + "text/plain": [ + " last_name\n", + "0 Morley\n", + "1 Scott\n", + "2 Haley\n", + "3 Mills\n", + "4 Grimes" + ] + }, + "execution_count": 385, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exam_test_1" + ] + }, + { + "cell_type": "code", + "execution_count": 387, "metadata": {}, "outputs": [ { @@ -2496,7 +2571,7 @@ "pandas.core.frame.DataFrame" ] }, - "execution_count": 45, + "execution_count": 387, "metadata": {}, "output_type": "execute_result" } @@ -2505,21 +2580,53 @@ "type(exam_test_1)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Squeeze the CSV Data\n", + "https://pandas.pydata.org/pandas-docs/version/1.5.0/reference/api/pandas.read_csv.html\n" + ] + }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 390, "metadata": {}, "outputs": [], "source": [ "exam_test_2 = pd.read_csv('exam_review.csv',\n", " sep='>',\n", - " usecols=['last_name'],\n", - " squeeze=True)" + " usecols=['last_name']).squeeze() # the squeeze method converts the column to a series after reading the CSV file" + ] + }, + { + "cell_type": "code", + "execution_count": 392, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Morley\n", + "1 Scott\n", + "2 Haley\n", + "3 Mills\n", + "4 Grimes\n", + "Name: last_name, dtype: object" + ] + }, + "execution_count": 392, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exam_test_2" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 394, "metadata": {}, "outputs": [ { @@ -2528,7 +2635,7 @@ "pandas.core.series.Series" ] }, - "execution_count": 47, + "execution_count": 394, "metadata": {}, "output_type": "execute_result" } @@ -2550,7 +2657,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 397, "metadata": {}, "outputs": [ { @@ -2617,7 +2724,7 @@ "2 Amy Grimes 23 91.0 81" ] }, - "execution_count": 48, + "execution_count": 397, "metadata": {}, "output_type": 
"execute_result" } @@ -2635,16 +2742,16 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 400, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "',first_name,last_name,age,math_score,french_score\\n0,Melvin,Scott,24,77.0,83\\n1,Gerard,Mills,19,78.0,72\\n2,Amy,Grimes,23,91.0,81\\n'" + "',first_name,last_name,age,math_score,french_score\\r\\n0,Melvin,Scott,24,77.0,83\\r\\n1,Gerard,Mills,19,78.0,72\\r\\n2,Amy,Grimes,23,91.0,81\\r\\n'" ] }, - "execution_count": 49, + "execution_count": 400, "metadata": {}, "output_type": "execute_result" } @@ -2662,7 +2769,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 403, "metadata": {}, "outputs": [], "source": [ @@ -2671,7 +2778,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 405, "metadata": {}, "outputs": [ { @@ -2742,7 +2849,7 @@ "2 2 Amy Grimes 23 91.0 81" ] }, - "execution_count": 51, + "execution_count": 405, "metadata": {}, "output_type": "execute_result" } @@ -2753,7 +2860,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 407, "metadata": {}, "outputs": [], "source": [ @@ -2763,7 +2870,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 409, "metadata": { "scrolled": true }, @@ -2832,7 +2939,7 @@ "2 Amy Grimes 23 91.0 81" ] }, - "execution_count": 53, + "execution_count": 409, "metadata": {}, "output_type": "execute_result" } @@ -2851,7 +2958,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -2865,9 +2972,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.12.4" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/unit-1-reading-data-with-python-and-pandas/lesson-1-reading-csv-and-txt-files/files/out.csv 
b/unit-1-reading-data-with-python-and-pandas/lesson-1-reading-csv-and-txt-files/files/out.csv new file mode 100644 index 0000000..44a666c --- /dev/null +++ b/unit-1-reading-data-with-python-and-pandas/lesson-1-reading-csv-and-txt-files/files/out.csv @@ -0,0 +1,4 @@ +first_name,last_name,age,math_score,french_score +Melvin,Scott,24,77.0,83 +Gerard,Mills,19,78.0,72 +Amy,Grimes,23,91.0,81