Skip to content

Commit 1e1e1bf

Browse files
committed
ENH: Add support for writing variable labels
Add support for writing variable labels. Fix documentation for to_stata. Clean up function name to improve readability. closes #13536, closes #13535
1 parent 3f6d4bd commit 1e1e1bf

File tree

4 files changed

+114
-21
lines changed

4 files changed

+114
-21
lines changed

doc/source/whatsnew/v0.19.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,8 @@ Other enhancements
250250
- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
251251
- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
252252

253+
- ``to_stata`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to map column names to labels (:issue:`13535`, :issue:`13536`)
254+
253255
.. _whatsnew_0190.api:
254256

255257
API changes

pandas/core/frame.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -1467,7 +1467,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
14671467

14681468
def to_stata(self, fname, convert_dates=None, write_index=True,
14691469
encoding="latin-1", byteorder=None, time_stamp=None,
1470-
data_label=None):
1470+
data_label=None, variable_labels=None):
14711471
"""
14721472
Write the DataFrame to a Stata binary dta file
14731473
@@ -1480,11 +1480,24 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
14801480
format that you want to use for the dates. Options are
14811481
'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
14821482
number or a name.
1483+
write_index : bool
1484+
Write the index to Stata dataset.
14831485
encoding : str
14841486
Default is latin-1. Note that Stata does not support unicode.
14851487
byteorder : str
14861488
Can be ">", "<", "little", or "big". The default is None which uses
14871489
`sys.byteorder`
1490+
time_stamp : datetime
1491+
A date time to use when writing the file. Can be None, in which
1492+
case the current time is used.
1493+
data_label : str
1494+
A label for the data set. Should be 80 characters or smaller.
1495+
1496+
.. versionadded:: 0.19.0
1497+
1498+
variable_labels : dict
1499+
Dictionary containing columns as keys and variable labels as
1500+
values. Each label must be 80 characters or smaller.
14881501
14891502
Examples
14901503
--------
@@ -1500,7 +1513,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
15001513
writer = StataWriter(fname, self, convert_dates=convert_dates,
15011514
encoding=encoding, byteorder=byteorder,
15021515
time_stamp=time_stamp, data_label=data_label,
1503-
write_index=write_index)
1516+
write_index=write_index,
1517+
variable_labels=variable_labels)
15041518
writer.write_file()
15051519

15061520
@Appender(fmt.docstring_to_string, indents=1)

pandas/io/stata.py

+38-13
Original file line numberDiff line numberDiff line change
@@ -1059,7 +1059,7 @@ def _read_new_header(self, first_char):
10591059
self.lbllist = self._get_lbllist()
10601060

10611061
self.path_or_buf.seek(self._seek_variable_labels)
1062-
self.vlblist = self._get_vlblist()
1062+
self._variable_labels = self._get_variable_labels()
10631063

10641064
# Get data type information, works for versions 117-118.
10651065
def _get_dtypes(self, seek_vartypes):
@@ -1127,7 +1127,7 @@ def _get_lbllist(self):
11271127
return [self._null_terminate(self.path_or_buf.read(b))
11281128
for i in range(self.nvar)]
11291129

1130-
def _get_vlblist(self):
1130+
def _get_variable_labels(self):
11311131
if self.format_version == 118:
11321132
vlblist = [self._decode(self.path_or_buf.read(321))
11331133
for i in range(self.nvar)]
@@ -1242,7 +1242,7 @@ def _read_old_header(self, first_char):
12421242

12431243
self.lbllist = self._get_lbllist()
12441244

1245-
self.vlblist = self._get_vlblist()
1245+
self._variable_labels = self._get_variable_labels()
12461246

12471247
# ignore expansion fields (Format 105 and later)
12481248
# When reading, read five bytes; the last four bytes now tell you
@@ -1306,11 +1306,11 @@ def _read_value_labels(self):
13061306
while True:
13071307
if self.format_version >= 117:
13081308
if self.path_or_buf.read(5) == b'</val': # <lbl>
1309-
break # end of variable label table
1309+
break # end of value label table
13101310

13111311
slength = self.path_or_buf.read(4)
13121312
if not slength:
1313-
break # end of variable label table (format < 117)
1313+
break # end of value label table (format < 117)
13141314
if self.format_version <= 117:
13151315
labname = self._null_terminate(self.path_or_buf.read(33))
13161316
else:
@@ -1666,7 +1666,7 @@ def variable_labels(self):
16661666
"""Returns variable labels as a dict, associating each variable name
16671667
with corresponding label
16681668
"""
1669-
return dict(zip(self.varlist, self.vlblist))
1669+
return dict(zip(self.varlist, self._variable_labels))
16701670

16711671
def value_labels(self):
16721672
"""Returns a dict, associating each variable name a dict, associating
@@ -1696,7 +1696,7 @@ def _set_endianness(endianness):
16961696

16971697
def _pad_bytes(name, length):
16981698
"""
1699-
Takes a char string and pads it wih null bytes until it's length chars
1699+
Takes a char string and pads it with null bytes until it's length chars
17001700
"""
17011701
return name + "\x00" * (length - len(name))
17021702

@@ -1831,6 +1831,12 @@ class StataWriter(StataParser):
18311831
data_label : str
18321832
A label for the data set. Should be 80 characters or smaller.
18331833
1834+
.. versionadded:: 0.19.0
1835+
1836+
variable_labels : dict
1837+
Dictionary containing columns as keys and variable labels as values.
1838+
Each label must be 80 characters or smaller.
1839+
18341840
Returns
18351841
-------
18361842
writer : StataWriter instance
@@ -1853,12 +1859,13 @@ class StataWriter(StataParser):
18531859

18541860
def __init__(self, fname, data, convert_dates=None, write_index=True,
18551861
encoding="latin-1", byteorder=None, time_stamp=None,
1856-
data_label=None):
1862+
data_label=None, variable_labels=None):
18571863
super(StataWriter, self).__init__(encoding)
18581864
self._convert_dates = convert_dates
18591865
self._write_index = write_index
18601866
self._time_stamp = time_stamp
18611867
self._data_label = data_label
1868+
self._variable_labels = variable_labels
18621869
# attach nobs, nvars, data, varlist, typlist
18631870
self._prepare_pandas(data)
18641871

@@ -2135,11 +2142,29 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
21352142
else: # Default is empty label
21362143
self._write(_pad_bytes("", 33))
21372144

2138-
def _write_variable_labels(self, labels=None):
2139-
nvar = self.nvar
2140-
if labels is None:
2141-
for i in range(nvar):
2142-
self._write(_pad_bytes("", 81))
2145+
def _write_variable_labels(self):
2146+
# Missing labels are 80 blank characters plus null termination
2147+
blank = _pad_bytes('', 81)
2148+
2149+
if self._variable_labels is None:
2150+
for i in range(self.nvar):
2151+
self._write(blank)
2152+
return
2153+
2154+
for col in self.data:
2155+
if col in self._variable_labels:
2156+
label = self._variable_labels[col]
2157+
if len(label) > 80:
2158+
raise ValueError('Variable labels must be 80 characters '
2159+
'or fewer')
2160+
is_latin1 = all(ord(c) < 256 for c in label)
2161+
if not is_latin1:
2162+
raise ValueError('Variable labels must contain only '
2163+
'characters that can be encoded in '
2164+
'Latin-1')
2165+
self._write(_pad_bytes(label, 81))
2166+
else:
2167+
self._write(blank)
21432168

21442169
def _prepare_data(self):
21452170
data = self.data

pandas/io/tests/test_stata.py

+58-6
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,27 @@
11
# -*- coding: utf-8 -*-
22
# pylint: disable=E1101
33

4-
from datetime import datetime
54
import datetime as dt
65
import os
7-
import warnings
8-
import nose
96
import struct
107
import sys
8+
import warnings
9+
from datetime import datetime
1110
from distutils.version import LooseVersion
1211

12+
import nose
1313
import numpy as np
1414

1515
import pandas as pd
16+
import pandas.util.testing as tm
17+
from pandas import compat
1618
from pandas.compat import iterkeys
1719
from pandas.core.frame import DataFrame, Series
1820
from pandas.types.common import is_categorical_dtype
21+
from pandas.tslib import NaT
1922
from pandas.io.parsers import read_csv
2023
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
2124
PossiblePrecisionLoss, StataMissingValue)
22-
import pandas.util.testing as tm
23-
from pandas.tslib import NaT
24-
from pandas import compat
2525

2626

2727
class TestStata(tm.TestCase):
@@ -1113,6 +1113,58 @@ def test_read_chunks_columns(self):
11131113
tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
11141114
pos += chunksize
11151115

1116+
def test_write_variable_labels(self):
1117+
# GH 13631, add support for writing variable labels
1118+
original = pd.DataFrame({'a': [1, 2, 3, 4],
1119+
'b': [1.0, 3.0, 27.0, 81.0],
1120+
'c': ['Atlanta', 'Birmingham',
1121+
'Cincinnati', 'Detroit']})
1122+
original.index.name = 'index'
1123+
variable_labels = {'a': 'City Rank', 'b': 'City Exponent', 'c': 'City'}
1124+
with tm.ensure_clean() as path:
1125+
original.to_stata(path, variable_labels=variable_labels)
1126+
with StataReader(path) as sr:
1127+
read_labels = sr.variable_labels()
1128+
expected_labels = {'index': '',
1129+
'a': 'City Rank',
1130+
'b': 'City Exponent',
1131+
'c': 'City'}
1132+
tm.assert_equal(read_labels, expected_labels)
1133+
1134+
variable_labels['index'] = 'The Index'
1135+
with tm.ensure_clean() as path:
1136+
original.to_stata(path, variable_labels=variable_labels)
1137+
with StataReader(path) as sr:
1138+
read_labels = sr.variable_labels()
1139+
tm.assert_equal(read_labels, variable_labels)
1140+
1141+
def test_write_variable_label_errors(self):
1142+
original = pd.DataFrame({'a': [1, 2, 3, 4],
1143+
'b': [1.0, 3.0, 27.0, 81.0],
1144+
'c': ['Atlanta', 'Birmingham',
1145+
'Cincinnati', 'Detroit']})
1146+
values = [u'\u03A1', u'\u0391',
1147+
u'\u039D', u'\u0394',
1148+
u'\u0391', u'\u03A3']
1149+
1150+
variable_labels_utf8 = {'a': 'City Rank',
1151+
'b': 'City Exponent',
1152+
'c': u''.join(values)}
1153+
1154+
with tm.assertRaises(ValueError):
1155+
with tm.ensure_clean() as path:
1156+
original.to_stata(path, variable_labels=variable_labels_utf8)
1157+
1158+
variable_labels_long = {'a': 'City Rank',
1159+
'b': 'City Exponent',
1160+
'c': 'A very, very, very long variable label '
1161+
'that is too long for Stata which means '
1162+
'that it has more than 80 characters'}
1163+
1164+
with tm.assertRaises(ValueError):
1165+
with tm.ensure_clean() as path:
1166+
original.to_stata(path, variable_labels=variable_labels_long)
1167+
11161168

11171169
if __name__ == '__main__':
11181170
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)