diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 7c2e488b98509..b8475b41a4950 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -103,7 +103,6 @@ MultiIndex I/O ^^^ - - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - - @@ -160,6 +159,14 @@ Other - - +I/O and LZMA +~~~~~~~~~~~~ + +Some users may unknowingly have an incomplete Python installation, which lacks the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`). +Pandas will now warn, rather than raising an ``ImportError``, if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``. +A possible fix for the lack of the ``lzma`` module is to ensure you have the necessary libraries and then re-install Python. +For example, on macOS, installing Python with ``pyenv`` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like ``xz``). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. + .. _whatsnew_0.251.contributors: Contributors diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cafc31dad3568..6cc9dd22ce7c9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2,7 +2,6 @@ # See LICENSE for the license import bz2 import gzip -import lzma import os import sys import time @@ -59,9 +58,12 @@ from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom +from pandas.compat import _import_lzma, _get_lzma_file from pandas.errors import (ParserError, DtypeWarning, EmptyDataError, ParserWarning) +lzma = _import_lzma() + # Import CParserError as alias of ParserError for backwards compatibility. 
# Ultimately, we want to remove this import. See gh-12665 and gh-14479. CParserError = ParserError @@ -645,9 +647,9 @@ cdef class TextReader: 'zip file %s', str(zip_names)) elif self.compression == 'xz': if isinstance(source, str): - source = lzma.LZMAFile(source, 'rb') + source = _get_lzma_file(lzma)(source, 'rb') else: - source = lzma.LZMAFile(filename=source) + source = _get_lzma_file(lzma)(filename=source) else: raise ValueError('Unrecognized compression type: %s' % self.compression) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5ecd641fc68be..b32da8da3a1fb 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -10,6 +10,7 @@ import platform import struct import sys +import warnings PY35 = sys.version_info[:2] == (3, 5) PY36 = sys.version_info >= (3, 6) @@ -65,3 +66,32 @@ def is_platform_mac(): def is_platform_32bit(): return struct.calcsize("P") * 8 < 64 + + +def _import_lzma(): + """Attempts to import lzma, warning the user when lzma is not available. + """ + try: + import lzma + + return lzma + except ImportError: + msg = ( + "Could not import the lzma module. " + "Your installed Python is incomplete. " + "Attempting to use lzma compression will result in a RuntimeError." + ) + warnings.warn(msg) + + +def _get_lzma_file(lzma): + """Returns the lzma method LZMAFile when the module was correctly imported. + Otherwise, raises a RuntimeError. + """ + if lzma is None: + raise RuntimeError( + "lzma module not available. " + "A Python re-install with the proper " + "dependencies might be required to solve this issue." 
+ ) + return lzma.LZMAFile diff --git a/pandas/io/common.py b/pandas/io/common.py index e01e473047b88..ac57cef372399 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,7 +6,6 @@ import gzip from http.client import HTTPException # noqa from io import BytesIO -import lzma import mmap import os import pathlib @@ -23,6 +22,7 @@ from urllib.request import pathname2url, urlopen import zipfile +from pandas.compat import _get_lzma_file, _import_lzma from pandas.errors import ( # noqa AbstractMethodError, DtypeWarning, @@ -35,6 +35,8 @@ from pandas._typing import FilePathOrBuffer +lzma = _import_lzma() + # gh-12665: Alias for now and remove later. CParserError = ParserError @@ -395,7 +397,7 @@ def _get_handle( # XZ Compression elif compression == "xz": - f = lzma.LZMAFile(path_or_buf, mode) + f = _get_lzma_file(lzma)(path_or_buf, mode) # Unrecognized Compression else: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index ce459ab24afe0..16ca1109f266c 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,5 +1,7 @@ import contextlib import os +import subprocess +import textwrap import warnings import pytest @@ -125,3 +127,33 @@ def test_compression_warning(compression_only): with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): with f: df.to_csv(f, compression=compression_only) + + +def test_with_missing_lzma(): + """Tests if import pandas works when lzma is not present.""" + # https://github.com/pandas-dev/pandas/issues/27575 + code = textwrap.dedent( + """\ + import sys + sys.modules['lzma'] = None + import pandas + """ + ) + subprocess.check_output(["python", "-c", code]) + + +def test_with_missing_lzma_runtime(): + """Tests if RuntimeError is hit when calling lzma without + having the module available.""" + code = textwrap.dedent( + """ + import sys + import pytest + sys.modules['lzma'] = None + import pandas + df = pandas.DataFrame() + with 
pytest.raises(RuntimeError, match='lzma module'): + df.to_csv('foo.csv', compression='xz') + """ + ) + subprocess.check_output(["python", "-c", code]) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 076d0c9f947c7..30555508f0998 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,7 +13,6 @@ import bz2 import glob import gzip -import lzma import os import pickle import shutil @@ -22,7 +21,7 @@ import pytest -from pandas.compat import is_platform_little_endian +from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian import pandas as pd from pandas import Index @@ -30,6 +29,8 @@ from pandas.tseries.offsets import Day, MonthEnd +lzma = _import_lzma() + @pytest.fixture(scope="module") def current_pickle_data(): @@ -270,7 +271,7 @@ def compress_file(self, src_path, dest_path, compression): with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) elif compression == "xz": - f = lzma.LZMAFile(dest_path, "w") + f = _get_lzma_file(lzma)(dest_path, "w") else: msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index cf8452cdd0c59..a8f0d0da52e1f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -5,7 +5,6 @@ from functools import wraps import gzip import http.client -import lzma import os import re from shutil import rmtree @@ -26,7 +25,7 @@ ) import pandas._libs.testing as _testing -from pandas.compat import raise_with_traceback +from pandas.compat import _get_lzma_file, _import_lzma, raise_with_traceback from pandas.core.dtypes.common import ( is_bool, @@ -70,6 +69,8 @@ from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing +lzma = _import_lzma() + N = 30 K = 4 _RAISE_NETWORK_ERROR_DEFAULT = False @@ -211,7 +212,7 @@ def decompress_file(path, compression): elif compression 
== "bz2": f = bz2.BZ2File(path, "rb") elif compression == "xz": - f = lzma.LZMAFile(path, "rb") + f = _get_lzma_file(lzma)(path, "rb") elif compression == "zip": zip_file = zipfile.ZipFile(path) zip_names = zip_file.namelist() @@ -264,9 +265,7 @@ def write_to_compressed(compression, path, data, dest="test"): compress_method = bz2.BZ2File elif compression == "xz": - import lzma - - compress_method = lzma.LZMAFile + compress_method = _get_lzma_file(lzma) else: msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg)