Skip to content

PERF: leverage tzlocal package to provide 2000x speedup for dateutil.tz.tzlocal operations #24737

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions LICENSES/TZLOCAL_LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Copyright 2011-2017 Lennart Regebro

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
6 changes: 4 additions & 2 deletions asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def time_to_pydatetime(self, index_type):

class TzLocalize(object):

params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc(),
dateutil.tz.tzlocal()]
param_names = 'tz'

def setup(self, tz):
Expand Down Expand Up @@ -394,7 +395,8 @@ def time_dup_string_tzoffset_dates(self, cache):

class DatetimeAccessor(object):

params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc(),
dateutil.tz.tzlocal()]
param_names = 'tz'

def setup(self, tz):
Expand Down
Empty file added pandas/_libs/src/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions pandas/_libs/src/tzlocal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import sys
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't put this in src (which is not a real package, nor do we want it to be); putting it directly in pandas/_libs/tzlocal is fine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alternative would be something like pandas/vendored/. pd.io.clipboard would also belong in such a directory.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Call this pandas/_vendored/ (explicitly private).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make it clear that this is a vendored copy?

if sys.platform == 'win32':
from pandas._libs.src.tzlocal.win32 import get_localzone, reload_localzone
else:
from pandas._libs.src.tzlocal.unix import get_localzone, reload_localzone
164 changes: 164 additions & 0 deletions pandas/_libs/src/tzlocal/unix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import os
import pytz
import re

from pandas._libs.src.tzlocal import utils

_cache_tz = None


def _tz_from_env(tzenv):
    """Build a tzinfo object from the value of the ``TZ`` environment variable.

    Parameters
    ----------
    tzenv : str
        Value of ``TZ``. It may carry a leading ``':'``, and may be either
        a path to a zoneinfo file or a zoneinfo zone name.

    Returns
    -------
    tzinfo
        A tzinfo built from the file ``tzenv`` points at (named 'local'),
        or the pytz timezone registered under that name.

    Raises
    ------
    pytz.UnknownTimeZoneError
        If ``tzenv`` is neither an existing file nor a zone name known
        to pytz.
    """
    # Drop a leading colon before interpreting the value. ``startswith`` is
    # used rather than indexing so an empty string does not raise IndexError.
    if tzenv.startswith(':'):
        tzenv = tzenv[1:]

    # TZ specifies a file
    if os.path.exists(tzenv):
        with open(tzenv, 'rb') as tzfile:
            return pytz.tzfile.build_tzinfo('local', tzfile)

    # TZ specifies a zoneinfo zone.
    try:
        return pytz.timezone(tzenv)
    except pytz.UnknownTimeZoneError:
        # Interpolate the offending value; the message template contains
        # a %s placeholder that must be filled in.
        raise pytz.UnknownTimeZoneError(
            ("tzlocal() does not support non-zoneinfo timezones like %s. \n"
             "Please use a timezone in the form of Continent/City") % tzenv)


def _try_tz_from_env():
    """Return the timezone described by the ``TZ`` environment variable.

    Returns ``None`` when ``TZ`` is unset or empty, or when its value
    cannot be resolved (``pytz.UnknownTimeZoneError`` is swallowed).
    """
    env_value = os.environ.get('TZ')
    if not env_value:
        return None

    try:
        zone = _tz_from_env(env_value)
    except pytz.UnknownTimeZoneError:
        # Best effort only: the caller falls back to other lookups.
        return None
    return zone


def _get_localzone(_root='/'):
    """Try to find the local timezone configuration.

    This function prefers finding the timezone name and passing that to
    pytz, over passing in the localtime file, as in the latter case the
    zoneinfo name is unknown.

    Lookup order: the ``TZ`` environment variable, ``etc/timezone`` and
    ``var/db/zoneinfo``, ``etc/sysconfig/clock`` and ``etc/conf.d/clock``,
    the target of an ``etc/localtime`` symlink, Android's ``getprop``, and
    finally the raw ``etc/localtime`` / ``usr/local/etc/localtime`` files.

    Parameters
    ----------
    _root : str, default '/'
        Directory beneath which files like ``etc/localtime`` are looked up.
        This is primarily used by the tests; in normal usage the function
        is called without parameters.

    Returns
    -------
    tzinfo
        The timezone found by the first lookup that succeeds.

    Raises
    ------
    pytz.UnknownTimeZoneError
        If no timezone configuration could be found, or if a configuration
        file names a zone pytz does not know.
    """
    tzenv = _try_tz_from_env()
    if tzenv:
        return tzenv

    # Now look for distribution specific configuration files
    # that contain the timezone name.
    for configfile in ('etc/timezone', 'var/db/zoneinfo'):
        tzpath = os.path.join(_root, configfile)
        try:
            with open(tzpath, 'rb') as tzfile:
                data = tzfile.read()

                # Issue #3 was that /etc/timezone was a zoneinfo file.
                # That's a misconfiguration, but we need to handle it
                # gracefully:
                if data[:5] == b'TZif2':
                    continue

                etctz = data.strip().decode()
                if not etctz:
                    # Empty file, skip
                    continue
                for etctz in data.decode().splitlines():
                    # Get rid of host definitions and comments:
                    if ' ' in etctz:
                        etctz = etctz.split(' ', 1)[0]
                    if '#' in etctz:
                        etctz = etctz.split('#', 1)[0]
                    if not etctz:
                        continue
                    return pytz.timezone(etctz.replace(' ', '_'))
        except IOError:
            # File doesn't exist or is a directory
            continue

    # CentOS has a ZONE setting in /etc/sysconfig/clock,
    # OpenSUSE has a TIMEZONE setting in /etc/sysconfig/clock and
    # Gentoo has a TIMEZONE setting in /etc/conf.d/clock
    # We look through these files for a timezone:

    zone_re = re.compile(r'\s*ZONE\s*=\s*\"')
    timezone_re = re.compile(r'\s*TIMEZONE\s*=\s*\"')
    end_re = re.compile('\"')

    for filename in ('etc/sysconfig/clock', 'etc/conf.d/clock'):
        tzpath = os.path.join(_root, filename)
        try:
            with open(tzpath, 'rt') as tzfile:
                data = tzfile.readlines()

            for line in data:
                # Look for the ZONE= setting.
                match = zone_re.match(line)
                if match is None:
                    # No ZONE= setting. Look for the TIMEZONE= setting.
                    match = timezone_re.match(line)
                if match is not None:
                    # Some setting existed; keep the text between the
                    # opening quote and the next quote.
                    line = line[match.end():]
                    etctz = line[:end_re.search(line).start()]

                    # We found a timezone
                    return pytz.timezone(etctz.replace(' ', '_'))
        except IOError:
            # File doesn't exist or is a directory
            continue

    # systemd distributions use symlinks that include the zone name,
    # see manpage of localtime(5) and timedatectl(1)
    tzpath = os.path.join(_root, 'etc/localtime')
    if os.path.exists(tzpath) and os.path.islink(tzpath):
        tzpath = os.path.realpath(tzpath)
        # Strip one leading path component at a time until what remains is
        # a zone name pytz recognises. find() returns -1 when no '/' is
        # left, so ``start`` becomes 0 and the loop ends. Compare by value:
        # identity comparison against an int literal is unreliable.
        start = tzpath.find("/") + 1
        while start != 0:
            tzpath = tzpath[start:]
            try:
                return pytz.timezone(tzpath)
            except pytz.UnknownTimeZoneError:
                pass
            start = tzpath.find("/") + 1

    # Are we under Termux on Android? It's not officially supported, because
    # there is no reasonable way to run tests for this, but let's make an
    # effort.
    if os.path.exists('/system/bin/getprop'):
        import subprocess
        androidtz = subprocess.check_output(['getprop', 'persist.sys.timezone'])
        return pytz.timezone(androidtz.strip().decode())

    # No explicit setting existed. Use localtime
    for filename in ('etc/localtime', 'usr/local/etc/localtime'):
        tzpath = os.path.join(_root, filename)

        if not os.path.exists(tzpath):
            continue
        with open(tzpath, 'rb') as tzfile:
            return pytz.tzfile.build_tzinfo('local', tzfile)

    raise pytz.UnknownTimeZoneError('Can not find any timezone configuration')


def get_localzone():
    """Return the computer's configured local timezone.

    The timezone is looked up once and cached in the module-level
    ``_cache_tz``. ``utils.assert_tz_offset`` is run on the result on
    every call, including cache hits.
    """
    global _cache_tz
    zone = _cache_tz
    if zone is None:
        # First call: resolve and remember the zone before validating it.
        zone = _get_localzone()
        _cache_tz = zone

    utils.assert_tz_offset(zone)
    return zone


def reload_localzone():
    """Look the local timezone up again and replace the cached value.

    Call this when the system timezone has changed. The fresh value is
    stored in ``_cache_tz`` before ``utils.assert_tz_offset`` checks it.
    """
    global _cache_tz
    refreshed = _get_localzone()
    _cache_tz = refreshed
    utils.assert_tz_offset(refreshed)
    return refreshed
38 changes: 38 additions & 0 deletions pandas/_libs/src/tzlocal/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
import datetime


def get_system_offset():
    """Return the system's current UTC offset, taken from the ``time`` module.

    The timezone constants of ``time`` (altzone, daylight, timezone, and
    tzname) reflect the rules in effect at module load time or at the last
    tzset() call, and may be incorrect for times in the past.

    ``time`` is imported inside the function on purpose, to keep
    compatibility with Windows.

    Returns
    -------
    ``-time.altzone`` when a DST zone is defined and DST is currently in
    effect, otherwise ``-time.timezone``.
    """
    import time

    dst_in_effect = time.daylight and time.localtime().tm_isdst > 0
    return -(time.altzone if dst_in_effect else time.timezone)


def get_tz_offset(tz):
    """Return the current UTC offset of ``tz`` as a whole number of seconds.

    The offset is read from ``datetime.datetime.now(tz).utcoffset()`` and
    truncated with ``int``.
    """
    current = datetime.datetime.now(tz)
    delta = current.utcoffset()
    return int(delta.total_seconds())


def assert_tz_offset(tz):
    """Check that the offset of ``tz`` matches the system's own offset.

    A mismatch probably means a misconfiguration, for example an incorrect
    timezone set in the /etc/timezone file on systemd distributions.

    Raises
    ------
    ValueError
        If ``get_tz_offset(tz)`` differs from ``get_system_offset()``.
    """
    found = get_tz_offset(tz)
    expected = get_system_offset()
    if found == expected:
        return

    template = ('Timezone offset does not match system offset: {0} != {1}. '
                'Please, check your config files.')
    raise ValueError(template.format(found, expected))
104 changes: 104 additions & 0 deletions pandas/_libs/src/tzlocal/win32.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
try:
import _winreg as winreg
except ImportError:
import winreg

import pytz

from pandas._libs.src.tzlocal.windows_tz import win_tz
from pandas._libs.src.tzlocal import utils

_cache_tz = None


def valuestodict(key):
    """Collect the values of a registry key into a dictionary.

    The number of values comes from the second item of
    ``winreg.QueryInfoKey(key)``. Each ``winreg.EnumValue`` result
    contributes its first item as the dictionary key and its second item
    as the dictionary value.
    """
    value_count = winreg.QueryInfoKey(key)[1]
    entries = [winreg.EnumValue(key, index) for index in range(value_count)]
    return {entry[0]: entry[1] for entry in entries}


def get_localzone_name():
    """Return the zone name that ``win_tz`` maps the Windows timezone to.

    Windows is special. It has unique time zone names (in several meanings
    of the word) available, but unfortunately, they can be translated to
    the language of the operating system, so on older systems we need to do
    a backwards lookup, by going through all time zones and seeing which
    one matches.

    Returns
    -------
    The ``win_tz`` entry for the configured Windows timezone key name.

    Raises
    ------
    LookupError
        If no Windows timezone key name could be determined.
    pytz.UnknownTimeZoneError
        If the key name (with or without a " Standard Time" suffix) has no
        entry in ``win_tz``.
    """
    handle = winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE)
    # Every registry handle is closed in a ``finally`` so that none is
    # leaked, whichever branch runs and even if a lookup raises.
    try:
        TZLOCALKEYNAME = r"SYSTEM\CurrentControlSet\Control\TimeZoneInformation"
        localtz = winreg.OpenKey(handle, TZLOCALKEYNAME)
        try:
            keyvalues = valuestodict(localtz)
        finally:
            localtz.Close()

        if 'TimeZoneKeyName' in keyvalues:
            # Windows 7 (and Vista?)

            # For some reason this returns a string with loads of NUL bytes
            # at least on some systems. I don't know if this is a bug
            # somewhere, I just work around it.
            tzkeyname = keyvalues['TimeZoneKeyName'].split('\x00', 1)[0]
        else:
            # Windows 2000 or XP

            # This is the localized name:
            tzwin = keyvalues['StandardName']

            # Open the list of timezones to look up the real name:
            TZKEYNAME = r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Time Zones"
            tzkey = winreg.OpenKey(handle, TZKEYNAME)
            try:
                # Now, match this value to Time Zone information
                tzkeyname = None
                for i in range(winreg.QueryInfoKey(tzkey)[0]):
                    subkey = winreg.EnumKey(tzkey, i)
                    sub = winreg.OpenKey(tzkey, subkey)
                    try:
                        data = valuestodict(sub)
                    finally:
                        sub.Close()
                    try:
                        if data['Std'] == tzwin:
                            tzkeyname = subkey
                            break
                    except KeyError:
                        # This timezone didn't have proper configuration.
                        # Ignore it.
                        pass
            finally:
                tzkey.Close()
    finally:
        handle.Close()

    if tzkeyname is None:
        raise LookupError('Can not find Windows timezone configuration')

    timezone = win_tz.get(tzkeyname)
    if timezone is None:
        # Nope, that didn't work. Try adding "Standard Time",
        # it seems to work a lot of times:
        timezone = win_tz.get(tzkeyname + " Standard Time")

    # Return what we have.
    if timezone is None:
        raise pytz.UnknownTimeZoneError('Can not find timezone ' + tzkeyname)

    return timezone


def get_localzone():
    """Return the pytz timezone matching the Windows-configured timezone.

    The timezone is resolved once through ``get_localzone_name`` and cached
    in the module-level ``_cache_tz``. ``utils.assert_tz_offset`` is run on
    the result on every call, including cache hits.
    """
    global _cache_tz
    zone = _cache_tz
    if zone is None:
        # First call: resolve and remember the zone before validating it.
        zone = pytz.timezone(get_localzone_name())
        _cache_tz = zone

    utils.assert_tz_offset(zone)
    return zone


def reload_localzone():
    """Look the Windows timezone up again and replace the cached value.

    Call this when the system timezone has changed. The fresh value is
    stored in ``_cache_tz`` before ``utils.assert_tz_offset`` checks it.
    """
    global _cache_tz
    zone_name = get_localzone_name()
    refreshed = pytz.timezone(zone_name)
    _cache_tz = refreshed
    utils.assert_tz_offset(refreshed)
    return refreshed
Loading