@@ -147,6 +147,85 @@ If the compression method cannot be inferred, use the ``compression`` argument:
147
147
(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open)
148
148
149
149
150
+ .. _whatsnew_150.enhancements.read_xml_dtypes :
151
+
152
+ read_xml now supports ``dtype``, ``converters``, and ``parse_dates``
153
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
154
+
155
+ Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns,
156
+ applying converter methods, and parsing dates (:issue:`43567`).
157
+
158
+ .. ipython :: python
159
+
160
+ xml_dates = """<?xml version='1.0' encoding='utf-8'?>
161
+ <data>
162
+ <row>
163
+ <shape>square</shape>
164
+ <degrees>00360</degrees>
165
+ <sides>4.0</sides>
166
+ <date>2020-01-01</date>
167
+ </row>
168
+ <row>
169
+ <shape>circle</shape>
170
+ <degrees>00360</degrees>
171
+ <sides/>
172
+ <date>2021-01-01</date>
173
+ </row>
174
+ <row>
175
+ <shape>triangle</shape>
176
+ <degrees>00180</degrees>
177
+ <sides>3.0</sides>
178
+ <date>2022-01-01</date>
179
+ </row>
180
+ </data>"""
181
+
182
+ df = pd.read_xml(
183
+ xml_dates,
184
+ dtype={'sides': 'Int64'},
185
+ converters={'degrees': str},
186
+ parse_dates=['date']
187
+ )
188
+ df
189
+ df.dtypes
190
+
191
+
192
+ .. _whatsnew_150.enhancements.read_xml_iterparse :
193
+
194
+ read_xml now supports large XML using ``iterparse``
195
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
196
+
197
+ For very large XML files that can range from hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
198
+ now supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_
199
+ which are memory-efficient methods to iterate through XML trees and extract specific elements
200
+ and attributes without holding the entire tree in memory (:issue:`45442`).
201
+
202
+ .. code-block :: ipython
203
+
204
+ In [1]: df = pd.read_xml(
205
+ ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
206
+ ... iterparse = {"page": ["title", "ns", "id"]}
207
+ ... )
208
+ In [2]: df
209
+ Out[2]:
210
+ title ns id
211
+ 0 Gettysburg Address 0 21450
212
+ 1 Main Page 0 42950
213
+ 2 Declaration by United Nations 0 8435
214
+ 3 Constitution of the United States of America 0 8435
215
+ 4 Declaration of Independence (Israel) 0 17858
216
+ ... ... ... ...
217
+ 3578760 Page:Black cat 1897 07 v2 n10.pdf/17 104 219649
218
+ 3578761 Page:Black cat 1897 07 v2 n10.pdf/43 104 219649
219
+ 3578762 Page:Black cat 1897 07 v2 n10.pdf/44 104 219649
220
+ 3578763 The History of Tom Jones, a Foundling/Book IX 0 12084291
221
+ 3578764 Page:Shakespeare of Stratford (1926) Yale.djvu/91 104 21450
222
+
223
+ [3578765 rows x 3 columns]
224
+
225
+
226
+ .. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk
227
+ .. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse
228
+
150
229
.. _whatsnew_150.enhancements.other :
151
230
152
231
Other enhancements
@@ -294,83 +373,10 @@ upon serialization. (Related issue :issue:`12997`)
294
373
Backwards incompatible API changes
295
374
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
296
375
297
- .. _whatsnew_150.api_breaking.read_xml_dtypes :
298
-
299
- read_xml now supports ``dtype ``, ``converters ``, and ``parse_dates ``
300
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
301
-
302
- Similar to other IO methods, :func: `pandas.read_xml ` now supports assigning specific dtypes to columns,
303
- apply converter methods, and parse dates (:issue: `43567 `).
304
-
305
- .. ipython :: python
306
-
307
- xml_dates = """ <?xml version='1.0' encoding='utf-8'?>
308
- <data>
309
- <row>
310
- <shape>square</shape>
311
- <degrees>00360</degrees>
312
- <sides>4.0</sides>
313
- <date>2020-01-01</date>
314
- </row>
315
- <row>
316
- <shape>circle</shape>
317
- <degrees>00360</degrees>
318
- <sides/>
319
- <date>2021-01-01</date>
320
- </row>
321
- <row>
322
- <shape>triangle</shape>
323
- <degrees>00180</degrees>
324
- <sides>3.0</sides>
325
- <date>2022-01-01</date>
326
- </row>
327
- </data>"""
376
+ .. _whatsnew_150.api_breaking.api_breaking1 :
328
377
329
- df = pd.read_xml(
330
- xml_dates,
331
- dtype = {' sides' : ' Int64' },
332
- converters = {' degrees' : str },
333
- parse_dates = [' date' ]
334
- )
335
- df
336
- df.dtypes
337
-
338
- .. _whatsnew_150.read_xml_iterparse :
339
-
340
- read_xml now supports large XML using ``iterparse ``
341
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
342
-
343
- For very large XML files that can range in hundreds of megabytes to gigabytes, :func: `pandas.read_xml `
344
- now supports parsing such sizeable files using `lxml's iterparse `_ and `etree's iterparse `_
345
- which are memory-efficient methods to iterate through XML trees and extract specific elements
346
- and attributes without holding entire tree in memory (:issue: `#45442 `).
347
-
348
- .. code-block :: ipython
349
-
350
- In [1]: df = pd.read_xml(
351
- ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
352
- ... iterparse = {"page": ["title", "ns", "id"]})
353
- ... )
354
- df
355
- Out[2]:
356
- title ns id
357
- 0 Gettysburg Address 0 21450
358
- 1 Main Page 0 42950
359
- 2 Declaration by United Nations 0 8435
360
- 3 Constitution of the United States of America 0 8435
361
- 4 Declaration of Independence (Israel) 0 17858
362
- ... ... ... ...
363
- 3578760 Page:Black cat 1897 07 v2 n10.pdf/17 104 219649
364
- 3578761 Page:Black cat 1897 07 v2 n10.pdf/43 104 219649
365
- 3578762 Page:Black cat 1897 07 v2 n10.pdf/44 104 219649
366
- 3578763 The History of Tom Jones, a Foundling/Book IX 0 12084291
367
- 3578764 Page:Shakespeare of Stratford (1926) Yale.djvu/91 104 21450
368
-
369
- [3578765 rows x 3 columns]
370
-
371
-
372
- .. _`lxml's iterparse` : https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk
373
- .. _`etree's iterparse` : https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse
378
+ api_breaking_change1
379
+ ^^^^^^^^^^^^^^^^^^^^
374
380
375
381
.. _whatsnew_150.api_breaking.api_breaking2 :
376
382
0 commit comments