Skip to content

Commit 3d1325f

Browse files
mrocklin authored and jorisvandenbossche committed
Fix missing data handling (geopandas#582)
Following on pandas-dev/pandas#17728

* Use None for missing values

Previously we used `Empty Polygon` for missing values. Now we revert to using NULL in GeometryArray (as before) and Python None when we convert to shapely objects. This makes it so that only Nones and NaNs are considered missing.
1 parent d11f5f6 commit 3d1325f

File tree

8 files changed

+59
-24
lines changed

8 files changed

+59
-24
lines changed

geopandas/array.py

+15-6
Original file line numberDiff line numberDiff line change
@@ -818,14 +818,23 @@ def _fill(self, idx, value):
818818
""" Fill index locations with value
819819
820820
Value should be a BaseGeometry
821+
822+
Returns a copy
821823
"""
822-
if not (isinstance(value, BaseGeometry) or value is None):
824+
base = [self]
825+
if isinstance(value, BaseGeometry):
826+
base.append(value)
827+
value = value.__geom__
828+
elif value is None:
829+
value = 0
830+
else:
823831
raise TypeError(
824-
"Value should be either a BaseGeometry or None, got %s" % str(value)
832+
"Value should be either a BaseGeometry or None, " "got %s" % str(value)
825833
)
826-
# self.data[idx] = value
827-
self.data[idx] = np.array([value], dtype=object)
828-
return self
834+
835+
new = GeometryArray(self.data.copy(), base=base)
836+
new.data[idx] = value
837+
return new
829838

830839
def fillna(self, value=None, method=None, limit=None):
831840
""" Fill NA/NaN values using the specified method.
@@ -897,7 +906,7 @@ def isna(self):
897906
"""
898907
Boolean NumPy array indicating if each value is missing
899908
"""
900-
return np.array([g is None for g in self], dtype="bool")
909+
return self.data == 0
901910

902911
def unique(self):
903912
"""Compute the ExtensionArray of unique values.

geopandas/geoseries.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -345,15 +345,24 @@ def notnull(self):
345345
"""Alias for `notna` method. See `notna` for more detail."""
346346
return self.notna()
347347

348-
def fillna(self, value=None, method=None, inplace=False, **kwargs):
349-
"""Fill NA values with a geometry (empty polygon by default).
350-
351-
"method" is currently not implemented for pandas <= 0.12.
352-
"""
348+
def fillna(self, value=None):
349+
"""Fill NA/NaN values with a geometry (empty polygon by default)"""
353350
if value is None:
354351
value = BaseGeometry()
355-
return super(GeoSeries, self).fillna(
356-
value=value, method=method, inplace=inplace, **kwargs
352+
return GeoSeries(
353+
self.array.fillna(value), index=self.index, crs=self.crs, name=self.name
354+
)
355+
356+
def dropna(self):
357+
""" Drop NA/NaN values
358+
359+
Note: the inplace keyword is not currently supported.
360+
"""
361+
return GeoSeries(
362+
self.array[~self.isna()],
363+
index=self.index[~self.isna()],
364+
crs=self.crs,
365+
name=self.name,
357366
)
358367

359368
def __contains__(self, other):

geopandas/tests/test_geocode.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def test_prepare_result_none():
101101
assert "address" in df
102102

103103
row = df.loc["b"]
104-
assert len(row["geometry"].coords) == 0
104+
assert not row["geometry"]
105105
assert np.isnan(row["address"])
106106

107107

geopandas/tests/test_geoseries.py

-1
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,6 @@ def test_transform(self):
162162
def test_fillna(self):
163163
# default is to fill with empty geometry
164164
na = self.na_none.fillna()
165-
assert isinstance(na[2], BaseGeometry)
166165
assert na[2].is_empty
167166
assert geom_equals(self.na_none[:2], na[:2])
168167
# XXX: method works inconsistently for different pandas versions

geopandas/tests/test_pandas_methods.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -233,10 +233,11 @@ def test_select_dtypes(df):
233233
# Missing values
234234

235235

236-
def test_fillna(s):
236+
def test_fillna():
237237
s2 = GeoSeries([Point(0, 0), None, Point(2, 2)])
238238
res = s2.fillna(Point(1, 1))
239-
assert_geoseries_equal(res, s)
239+
expected = GeoSeries([Point(0, 0), Point(1, 1), Point(2, 2)])
240+
assert_geoseries_equal(res, expected)
240241

241242

242243
def test_dropna():

geopandas/tests/test_vectorized.py

+11
Original file line numberDiff line numberDiff line change
@@ -424,3 +424,14 @@ def test_unary_union():
424424
def test_coords():
425425
L = T.exterior.coords
426426
assert L == [tuple(t.exterior.coords) for t in triangles]
427+
428+
429+
def test_fill():
430+
p = shapely.geometry.Point(1, 2)
431+
P2 = P._fill([0, 3], p)
432+
assert P2[0].equals(p)
433+
assert P2[3].equals(p)
434+
with pytest.raises(TypeError) as info:
435+
P._fill([1, 2], 123)
436+
437+
assert "123" in str(info.value)

geopandas/tools/tests/test_sjoin.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,13 @@ def test_sjoin(op, lsuffix, rsuffix, how, missing):
6565
right_touched = set()
6666
for left_index, left_row in left.iterrows():
6767
for right_index, right_row in right.iterrows():
68-
if getattr(left_row["geometry"], op)(right_row["geometry"]):
68+
left_geom = left_row["geometry"]
69+
right_geom = right_row["geometry"]
70+
if (
71+
left_geom
72+
and right_geom
73+
and getattr(left_row["geometry"], op)(right_row["geometry"])
74+
):
6975
left_out.append(left_index)
7076
right_out.append(right_index)
7177

@@ -81,15 +87,15 @@ def test_sjoin(op, lsuffix, rsuffix, how, missing):
8187
L = list(result.geometry)
8288
for t in triangles2:
8389
if t:
84-
assert any(t.equals(t2) for t2 in L)
90+
assert any(t2 and t.equals(t2) for t2 in L)
8591

8692
if how == "right":
8793
assert len(result) >= len(right_out)
8894
assert set(result.columns) == set(columns + ["index_left"])
8995
L = list(result.geometry)
9096
for p in points2:
9197
if p:
92-
assert any(p.equals(p2) for p2 in L)
98+
assert any(p2 and p.equals(p2) for p2 in L)
9399

94100

95101
def test_crs_mismatch():

geopandas/vectorized.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ cpdef get_element(np.ndarray[np.uintp_t, ndim=1, cast=True] geoms, int idx):
5959

6060
handle = get_geos_context_handle()
6161

62-
if not geom:
63-
geom = GEOSGeom_createEmptyPolygon_r(handle)
62+
if geom is NULL:
63+
return None
6464
else:
6565
geom = GEOSGeom_clone_r(handle, geom) # create a copy rather than deal with gc
6666

@@ -80,11 +80,11 @@ cpdef to_shapely(np.ndarray[np.uintp_t, ndim=1, cast=True] geoms):
8080
geom = <GEOSGeometry *> geoms[i]
8181

8282
if not geom:
83-
geom = GEOSGeom_createEmptyPolygon_r(handle)
83+
out[i] = None
8484
else:
8585
geom = GEOSGeom_clone_r(handle, geom) # create a copy rather than deal with gc
86+
out[i] = geom_factory(<np.uintp_t> geom)
8687

87-
out[i] = geom_factory(<np.uintp_t> geom)
8888

8989
return out
9090

0 commit comments

Comments
 (0)