
Commit 4a9d882

Added point in time support and the Search.iterate() method (#1833)
* Added point in time support and the Search.iterate() method
* Update elasticsearch_dsl/_async/search.py
* feedback

Co-authored-by: Quentin Pradet <[email protected]>
1 parent 76a57fd commit 4a9d882

File tree

5 files changed: +145 -0 lines changed

elasticsearch_dsl/_async/search.py

Lines changed: 43 additions & 0 deletions

```diff
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import contextlib
+
 from elasticsearch.exceptions import ApiError
 from elasticsearch.helpers import async_scan
 
@@ -92,6 +94,8 @@ async def scan(self):
         pass to the underlying ``scan`` helper from ``elasticsearch-py`` -
         https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
 
+        The ``iterate()`` method should be preferred, as it provides similar
+        functionality using an Elasticsearch point in time.
         """
         es = get_connection(self._using)
 
@@ -113,6 +117,45 @@ async def delete(self):
             )
         )
 
+    @contextlib.asynccontextmanager
+    async def point_in_time(self, keep_alive="1m"):
+        """
+        Open a point in time (pit) that can be used across several searches.
+
+        This method implements a context manager that returns a search object
+        configured to operate within the created pit.
+
+        :arg keep_alive: the time to live for the point in time, renewed with each search request
+        """
+        es = get_connection(self._using)
+
+        pit = await es.open_point_in_time(
+            index=self._index or "*", keep_alive=keep_alive
+        )
+        search = self.index().extra(pit={"id": pit["id"], "keep_alive": keep_alive})
+        if not search._sort:
+            search = search.sort("_shard_doc")
+        yield search
+        await es.close_point_in_time(id=pit["id"])
+
+    async def iterate(self, keep_alive="1m"):
+        """
+        Return a generator that iterates over all the documents matching the query.
+
+        This method uses a point in time to provide consistent results even when
+        the index is changing. It should be preferred over ``scan()``.
+
+        :arg keep_alive: the time to live for the point in time, renewed with each new search request
+        """
+        async with self.point_in_time(keep_alive=keep_alive) as s:
+            while True:
+                r = await s.execute()
+                for hit in r:
+                    yield hit
+                if len(r.hits) == 0:
+                    break
+                s = r.search_after()
+
 
 class AsyncMultiSearch(MultiSearchBase):
     """
```

elasticsearch_dsl/_sync/search.py

Lines changed: 41 additions & 0 deletions

```diff
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import contextlib
+
 from elasticsearch.exceptions import ApiError
 from elasticsearch.helpers import scan
 
@@ -88,6 +90,8 @@ def scan(self):
         pass to the underlying ``scan`` helper from ``elasticsearch-py`` -
         https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
 
+        The ``iterate()`` method should be preferred, as it provides similar
+        functionality using an Elasticsearch point in time.
         """
         es = get_connection(self._using)
 
@@ -105,6 +109,43 @@ def delete(self):
             es.delete_by_query(index=self._index, body=self.to_dict(), **self._params)
         )
 
+    @contextlib.contextmanager
+    def point_in_time(self, keep_alive="1m"):
+        """
+        Open a point in time (pit) that can be used across several searches.
+
+        This method implements a context manager that returns a search object
+        configured to operate within the created pit.
+
+        :arg keep_alive: the time to live for the point in time, renewed with each search request
+        """
+        es = get_connection(self._using)
+
+        pit = es.open_point_in_time(index=self._index or "*", keep_alive=keep_alive)
+        search = self.index().extra(pit={"id": pit["id"], "keep_alive": keep_alive})
+        if not search._sort:
+            search = search.sort("_shard_doc")
+        yield search
+        es.close_point_in_time(id=pit["id"])
+
+    def iterate(self, keep_alive="1m"):
+        """
+        Return a generator that iterates over all the documents matching the query.
+
+        This method uses a point in time to provide consistent results even when
+        the index is changing. It should be preferred over ``scan()``.
+
+        :arg keep_alive: the time to live for the point in time, renewed with each new search request
+        """
+        with self.point_in_time(keep_alive=keep_alive) as s:
+            while True:
+                r = s.execute()
+                for hit in r:
+                    yield hit
+                if len(r.hits) == 0:
+                    break
+                s = r.search_after()
+
 
 class MultiSearch(MultiSearchBase):
     """
```

tests/test_integration/_async/test_search.py

Lines changed: 31 additions & 0 deletions

```diff
@@ -179,6 +179,37 @@ async def test_search_after_no_results(async_data_client):
         await r.search_after()
 
 
+@pytest.mark.asyncio
+async def test_point_in_time(async_data_client):
+    page_size = 7
+    commits = []
+    async with AsyncSearch(index="flat-git")[:page_size].point_in_time(
+        keep_alive="30s"
+    ) as s:
+        pit_id = s._extra["pit"]["id"]
+        while True:
+            r = await s.execute()
+            commits += r.hits
+            if len(r.hits) < page_size:
+                break
+            s = r.search_after()
+            assert pit_id == s._extra["pit"]["id"]
+            assert "30s" == s._extra["pit"]["keep_alive"]
+
+    assert 52 == len(commits)
+    assert {d["_id"] for d in FLAT_DATA} == {c.meta.id for c in commits}
+
+
+@pytest.mark.asyncio
+async def test_iterate(async_data_client):
+    s = AsyncSearch(index="flat-git")
+
+    commits = [commit async for commit in s.iterate()]
+
+    assert 52 == len(commits)
+    assert {d["_id"] for d in FLAT_DATA} == {c.meta.id for c in commits}
+
+
 @pytest.mark.asyncio
 async def test_response_is_cached(async_data_client):
     s = Repository.search()
```

tests/test_integration/_sync/test_search.py

Lines changed: 29 additions & 0 deletions

```diff
@@ -171,6 +171,35 @@ def test_search_after_no_results(data_client):
         r.search_after()
 
 
+@pytest.mark.sync
+def test_point_in_time(data_client):
+    page_size = 7
+    commits = []
+    with Search(index="flat-git")[:page_size].point_in_time(keep_alive="30s") as s:
+        pit_id = s._extra["pit"]["id"]
+        while True:
+            r = s.execute()
+            commits += r.hits
+            if len(r.hits) < page_size:
+                break
+            s = r.search_after()
+            assert pit_id == s._extra["pit"]["id"]
+            assert "30s" == s._extra["pit"]["keep_alive"]
+
+    assert 52 == len(commits)
+    assert {d["_id"] for d in FLAT_DATA} == {c.meta.id for c in commits}
+
+
+@pytest.mark.sync
+def test_iterate(data_client):
+    s = Search(index="flat-git")
+
+    commits = [commit for commit in s.iterate()]
+
+    assert 52 == len(commits)
+    assert {d["_id"] for d in FLAT_DATA} == {c.meta.id for c in commits}
+
+
 @pytest.mark.sync
 def test_response_is_cached(data_client):
     s = Repository.search()
```

utils/run-unasync.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -72,6 +72,7 @@ def main(check=False):
         "async_sleep": "sleep",
         "assert_awaited_once_with": "assert_called_once_with",
         "pytest_asyncio": "pytest",
+        "asynccontextmanager": "contextmanager",
     }
     rules = [
         unasync.Rule(
```