|
1 | 1 | .. _io:
|
2 |
| -x |
| 2 | + |
3 | 3 | .. currentmodule:: pandas
|
4 | 4 |
|
5 | 5 | .. ipython:: python
|
@@ -1130,7 +1130,7 @@ options:
|
1130 | 1130 |
|
1131 | 1131 | .. _io.bad_lines:
|
1132 | 1132 |
|
1133 |
| -Handling "bad" lines |
| 1133 | +Handling "bad" lines - excluding the data |
1134 | 1134 | '''''''''''''''''''''''''''''''''''''''''
|
1135 | 1135 |
|
1136 | 1136 | Some files may have malformed lines with too few fields or too many. Lines with
|
@@ -1175,6 +1175,80 @@ data that appear in some lines but not others:
|
1175 | 1175 | 0 1 2 3
|
1176 | 1176 | 1 4 5 6
|
1177 | 1177 | 2 8 9 10
|
| 1178 | + |
| 1179 | +Handling "bad" lines - preserving the data |
| 1180 | +'''''''''''''''''''''''''''''''''''''''''' |
| 1181 | + |
| 1182 | +To preserve all data, you can pass a list of column ``names`` that is at least as long as the widest row: |
| 1183 | + |
| 1184 | +.. code-block:: ipython |
| 1185 | +
|
| 1186 | + In [31]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) |
| 1187 | + |
| 1188 | + Out[31]: |
| 1189 | + a b c d |
| 1190 | + 0 1 2 3 NaN |
| 1191 | + 1 4 5 6 7 |
| 1192 | + 2 8 9 10 NaN |
| 1193 | + |
| 1194 | +or you can use Python's built-in ``open`` function together with the ``csv`` module to find the width of the widest row: |
| 1195 | + |
| 1196 | +.. code-block:: ipython |
| 1197 | +
|
| 1198 | + In [32]: |
| 1199 | + import csv |
| 1200 | + with open('data.csv', newline='') as f: |
| 1201 | + reader = csv.reader(f) |
| 1202 | + max_width = 0 |
| 1203 | + for row in reader: |
| 1204 | + length = row.count(',') |
| 1205 | + if length > max_width: |
| 1206 | + max_width = length |
| 1207 | +
|
| 1208 | +and then either edit the CSV file itself so that every row has the same number of fields: |
| 1209 | + |
| 1210 | +.. code-block:: ipython |
| 1211 | +
|
| 1212 | + In [32] (cont'd): |
| 1213 | + |
| 1214 | + amended_rows = [] |
| 1215 | + for row in reader: |
| 1216 | + length = row.count(',') |
| 1217 | + if length < max_width: |
| 1218 | + for _ in range(max_width - length): |
| 1219 | + row = row + ',' |
| 1220 | + amended_rows.append(row) |
| 1221 | +
|
| 1222 | + writer = csv.writer(f) |
| 1223 | + writer.writerows(amended_rows) |
| 1224 | +
|
| 1225 | + pd.read_csv('data.csv') |
| 1226 | + |
| 1227 | + Out[32]: |
| 1228 | + a b c d |
| 1229 | + 0 1 2 3 NaN |
| 1230 | + 1 4 5 6 7 |
| 1231 | + 2 8 9 10 NaN |
| 1232 | + |
| 1233 | +or to specify ``names`` based on the length of the widest row: |
| 1234 | + |
| 1235 | +.. code-block:: ipython |
| 1236 | +
|
| 1237 | + In [32] (cont'd): |
| 1238 | + |
| 1239 | + label = 'c' |
| 1240 | + col_labels = [] |
| 1241 | + for col_num in range(max_width): |
| 1242 | + label = label + str(col_num) |
| 1243 | + col_labels.append(label) |
| 1244 | +
|
| 1245 | + pd.read_csv('data.csv', names=col_labels) |
| 1246 | + |
| 1247 | + Out[32]: |
| 1248 | + c1 c2 c3 c4 |
| 1249 | + 0 1 2 3 NaN |
| 1250 | + 1 4 5 6 7 |
| 1251 | + 2 8 9 10 NaN |
1178 | 1252 |
|
1179 | 1253 | .. _io.dialect:
|
1180 | 1254 |
|
|
0 commit comments