Skip to content

Commit f7de279

Browse files
mrocklin authored and jorisvandenbossche committed
Fix missing data handling (geopandas#582)
Following on pandas-dev/pandas#17728.

* Use None for missing values

  Previously we used `Empty Polygon` for missing values. Now we revert to using NULL in GeometryArray (as before) and Python None when we convert to shapely objects. As a result, only None and NaN values are considered missing.
1 parent 3acae8d commit f7de279

File tree

9 files changed

+68
-37
lines changed

9 files changed

+68
-37
lines changed

geopandas/array.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -462,12 +462,19 @@ def _fill(self, idx, value):
462462
463463
Returns a copy
464464
"""
465-
if not (isinstance(value, BaseGeometry) or value is None):
465+
base = [self]
466+
if isinstance(value, BaseGeometry):
467+
base.append(value)
468+
value = value.__geom__
469+
elif value is None:
470+
value = 0
471+
else:
466472
raise TypeError("Value should be either a BaseGeometry or None, "
467473
"got %s" % str(value))
468-
# self.data[idx] = value
469-
self.data[idx] = np.array([value], dtype=object)
470-
return self
474+
475+
new = GeometryArray(self.data.copy(), base=base)
476+
new.data[idx] = value
477+
return new
471478

472479
def fillna(self, value=None, method=None, limit=None):
473480
""" Fill NA/NaN values using the specified method.
@@ -826,7 +833,7 @@ def isna(self):
826833
"""
827834
Boolean NumPy array indicating if each value is missing
828835
"""
829-
return np.array([g is None for g in self], dtype='bool')
836+
return self.data == 0
830837

831838
def unique(self):
832839
"""Compute the ExtensionArray of unique values.

geopandas/geoseries.py

+16-14
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def __init__(self, *args, **kwargs):
9494
# by calling the Series init
9595
pass
9696

97-
def append(self, *args, **kwargs):
97+
def append(self, *args, **kwargs):
    """Append by delegating to the wrapped pandas ``append`` method.

    All positional and keyword arguments are forwarded unchanged to
    ``self._wrapped_pandas_method('append', ...)`` and its result is
    returned.
    """
    # ``**kwargs`` has to be part of the signature: the call below forwards
    # it, and without it every call failed with NameError.
    return self._wrapped_pandas_method('append', *args, **kwargs)
9999

100100
@property
@@ -242,9 +242,8 @@ def isna(self):
242242
--------
243243
GeoSeries.notna : inverse of isna
244244
"""
245-
non_geo_null = super(GeoSeries, self).isnull()
246-
val = self.apply(_is_empty)
247-
return Series(np.logical_or(non_geo_null, val))
245+
return pd.Series(self.array.data == 0, index=self.index,
246+
name=self.name)
248247

249248
def isnull(self):
250249
"""Alias for `isna` method. See `isna` for more detail."""
@@ -271,21 +270,24 @@ def notnull(self):
271270
"""Alias for `notna` method. See `notna` for more detail."""
272271
return self.notna()
273272

274-
def fillna(self, value=None, method=None, inplace=False,
275-
**kwargs):
276-
"""Fill NA/NaN values with a geometry (empty polygon by default).
277-
278-
"method" is currently not implemented for pandas <= 0.12.
279-
"""
273+
def fillna(self, value=None):
274+
""" Fill NA/NaN values with a geometry (empty polygon by default) """
280275
if value is None:
281276
value = BaseGeometry()
282-
return super(GeoSeries, self).fillna(value=value, method=method,
283-
inplace=inplace, **kwargs)
277+
return GeoSeries(self.array.fillna(value), index=self.index,
278+
crs=self.crs, name=self.name)
279+
280+
def dropna(self):
    """ Drop NA/NaN values

    Returns a new GeoSeries holding the entries, and their index labels,
    for which ``isna()`` is False. ``crs`` and ``name`` are carried over.

    Note: the inplace keyword is not currently supported.
    """
    # Build the "not missing" mask once and reuse it for both the values
    # and the index, instead of evaluating isna() twice.
    keep = ~self.isna()
    return GeoSeries(self.array[keep], index=self.index[keep],
                     crs=self.crs, name=self.name)
284288

285289
def align(self, other, join='outer', level=None, copy=True,
286290
fill_value=None, **kwargs):
287-
if fill_value is None:
288-
fill_value = BaseGeometry()
289291
left, right = super(GeoSeries, self).align(other, join=join,
290292
level=level, copy=copy,
291293
fill_value=fill_value,

geopandas/io/file.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import math
2+
from numbers import Number
13
import os
24
from distutils.version import LooseVersion
35

geopandas/tests/test_geocode.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def test_prepare_result_none():
103103
assert 'address' in df
104104

105105
row = df.loc['b']
106-
assert len(row['geometry'].coords) == 0
106+
assert not row['geometry']
107107
assert np.isnan(row['address'])
108108

109109

geopandas/tests/test_geoseries.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@ def test_align(self):
101101
a1, a2 = self.a1.align(self.a2)
102102
assert isinstance(a1, GeoSeries)
103103
assert isinstance(a2, GeoSeries)
104-
assert a2['A'].is_empty
104+
assert a2['A'] is None
105105
assert a1['B'].equals(a2['B'])
106-
assert a1['C'].is_empty
106+
assert a1['C'] is None
107107

108108
def test_align_crs(self):
109109
a1 = self.a1
@@ -180,7 +180,6 @@ def test_transform(self):
180180
def test_fillna(self):
181181
# default is to fill with empty geometry
182182
na = self.na_none.fillna()
183-
assert isinstance(na[2], BaseGeometry)
184183
assert na[2].is_empty
185184
assert geom_equals(self.na_none[:2], na[:2])
186185
# XXX: method works inconsistently for different pandas versions
@@ -241,3 +240,14 @@ def test_construct_from_series():
241240
assert [a.equals(b) for a, b in zip(s, g)]
242241
assert s.name == g.name
243242
assert s.index is g.index
243+
244+
245+
def test_missing_values():
    """Check which entries count as missing versus merely empty.

    The series mixes a real point, ``None``, ``np.nan`` and two empty
    geometries. Only the ``None`` and ``np.nan`` entries are expected to be
    reported by ``isna``; only the two empty geometries by ``is_empty``.
    """
    geoms = [Point(1, 1), None, np.nan, BaseGeometry(), Polygon()]
    series = GeoSeries(geoms)

    expected_missing = [False, True, True, False, False]
    expected_empty = [False, False, False, True, True]
    assert series.isna().tolist() == expected_missing
    assert series.is_empty.tolist() == expected_empty

    # Filling keeps all five rows and leaves nothing missing.
    assert not series.fillna().isna().any()
    assert len(series.fillna()) == 5

    # Dropping removes exactly the two missing rows.
    assert not series.dropna().isna().any()
    assert len(series.dropna()) == 3

geopandas/tests/test_pandas_methods.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import pandas as pd
99
import shapely
1010
from shapely.geometry import Point, Polygon
11+
from shapely.geometry.base import BaseGeometry
1112

1213
from geopandas import GeoDataFrame, GeoSeries
1314
from geopandas.tests.util import assert_geoseries_equal
@@ -161,25 +162,21 @@ def test_select_dtypes(df):
161162
# Missing values
162163

163164

164-
@pytest.mark.xfail
165165
def test_fillna():
166-
# this currently does not work (it seems to fill in the second coordinate
167-
# of the point
168166
s2 = GeoSeries([Point(0, 0), None, Point(2, 2)])
169167
res = s2.fillna(Point(1, 1))
170-
assert_geoseries_equal(res, s)
168+
expected = GeoSeries([Point(0, 0), Point(1, 1), Point(2, 2)])
169+
assert_geoseries_equal(res, expected)
171170

172171

173-
@pytest.mark.xfail
174172
def test_dropna():
175-
# this currently does not work (doesn't drop)
176173
s2 = GeoSeries([Point(0, 0), None, Point(2, 2)])
177174
res = s2.dropna()
178175
exp = s2.loc[[0, 2]]
179176
assert_geoseries_equal(res, exp)
180177

181178

182-
@pytest.mark.parametrize("NA", [None, np.nan, Point(), Polygon()])
179+
@pytest.mark.parametrize("NA", [None, np.nan])
183180
def test_isna(NA):
184181
s2 = GeoSeries([Point(0, 0), NA, Point(2, 2)], index=[2, 4, 5], name='tt')
185182
exp = pd.Series([False, True, False], index=[2, 4, 5], name='tt')

geopandas/tests/test_vectorized.py

+11
Original file line numberDiff line numberDiff line change
@@ -400,3 +400,14 @@ def test_unary_union():
400400
def test_coords():
401401
L = T.exterior.coords
402402
assert L == [tuple(t.exterior.coords) for t in triangles]
403+
404+
405+
def test_fill():
    """``_fill`` places the given geometry at the requested positions and
    raises TypeError, mentioning the bad value, for a non-geometry."""
    point = shapely.geometry.Point(1, 2)
    filled = P._fill([0, 3], point)
    for position in (0, 3):
        assert filled[position].equals(point)

    # A plain integer is neither a geometry nor None.
    with pytest.raises(TypeError) as excinfo:
        P._fill([1, 2], 123)

    assert '123' in str(excinfo.value)

geopandas/tools/tests/test_sjoin.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,9 @@ def test_sjoin(op, lsuffix, rsuffix, how, missing):
5959
right_touched = set()
6060
for left_index, left_row in left.iterrows():
6161
for right_index, right_row in right.iterrows():
62-
if getattr(left_row['geometry'], op)(right_row['geometry']):
62+
left_geom = left_row['geometry']
63+
right_geom = right_row['geometry']
64+
if left_geom and right_geom and getattr(left_row['geometry'], op)(right_row['geometry']):
6365
left_out.append(left_index)
6466
right_out.append(right_index)
6567

@@ -75,15 +77,15 @@ def test_sjoin(op, lsuffix, rsuffix, how, missing):
7577
L = list(result.geometry)
7678
for t in triangles2:
7779
if t:
78-
assert any(t.equals(t2) for t2 in L)
80+
assert any(t2 and t.equals(t2) for t2 in L)
7981

8082
if how == 'right':
8183
assert len(result) >= len(right_out)
8284
assert set(result.columns) == set(columns + ['index_left'])
8385
L = list(result.geometry)
8486
for p in points2:
8587
if p:
86-
assert any(p.equals(p2) for p2 in L)
88+
assert any(p2 and p.equals(p2) for p2 in L)
8789

8890

8991
def test_crs_mismatch():

geopandas/vectorized.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ cpdef get_element(np.ndarray[np.uintp_t, ndim=1, cast=True] geoms, int idx):
5959

6060
handle = get_geos_context_handle()
6161

62-
if not geom:
63-
geom = GEOSGeom_createEmptyPolygon_r(handle)
62+
if geom is NULL:
63+
return None
6464
else:
6565
geom = GEOSGeom_clone_r(handle, geom) # create a copy rather than deal with gc
6666

@@ -80,11 +80,11 @@ cpdef to_shapely(np.ndarray[np.uintp_t, ndim=1, cast=True] geoms):
8080
geom = <GEOSGeometry *> geoms[i]
8181

8282
if not geom:
83-
geom = GEOSGeom_createEmptyPolygon_r(handle)
83+
out[i] = None
8484
else:
8585
geom = GEOSGeom_clone_r(handle, geom) # create a copy rather than deal with gc
86+
out[i] = geom_factory(<np.uintp_t> geom)
8687

87-
out[i] = geom_factory(<np.uintp_t> geom)
8888

8989
return out
9090

0 commit comments

Comments
 (0)