Skip to content

Commit 4b8580f

Browse files
elacuesta and Gallaecio authored
Page event handlers (#28)
* Event handlers for Page objects * Rename meta key * Update readme * Update README.md Co-authored-by: Adrián Chaves <[email protected]> Co-authored-by: Adrián Chaves <[email protected]>
1 parent f3e5cfc commit 4b8580f

File tree

5 files changed

+204
-1
lines changed

5 files changed

+204
-1
lines changed

README.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,50 @@ Response, containing the final result.
271271
```
272272

273273

274+
## Page events
275+
A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers`
276+
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key.
277+
Keys are the names of the events to be handled (`dialog`, `download`, etc).
278+
Values can be either callables or strings (in which case a spider method with that name will be looked up).
279+
280+
Example:
281+
282+
```python
283+
from playwright.async_api import Dialog
284+
285+
async def handle_dialog(dialog: Dialog) -> None:
286+
logging.info(f"Handled dialog with message: {dialog.message}")
287+
await dialog.dismiss()
288+
289+
class EventSpider(scrapy.Spider):
290+
name = "event"
291+
292+
def start_requests(self):
293+
yield scrapy.Request(
294+
url="https://example.org",
295+
meta=dict(
296+
playwright=True,
297+
playwright_page_event_handlers={
298+
"dialog": handle_dialog,
299+
"response": "handle_response",
300+
},
301+
),
302+
)
303+
304+
async def handle_response(self, response: PlaywrightResponse) -> None:
305+
logging.info(f"Received response with URL {response.url}")
306+
```
307+
308+
See the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page/) for a list of
309+
the accepted events and the arguments passed to their handlers.
310+
311+
**Note**: keep in mind that, unless they are
312+
[removed later](https://playwright.dev/python/docs/events/#addingremoving-event-listener),
313+
these handlers will remain attached to the page and will be called for subsequent
314+
downloads using the same page. This is usually not a problem, since by default
315+
requests are performed in single-use pages.
316+
317+
274318
## Examples
275319

276320
**Click on a link, save the resulting page as PDF**

examples/events.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from playwright.async_api import Dialog, Response as PlaywrightResponse
2+
from scrapy import Spider, Request
3+
from scrapy.crawler import CrawlerProcess
4+
from scrapy_playwright.page import PageCoroutine
5+
6+
7+
class EventsSpider(Spider):
8+
"""
9+
Handle page events
10+
"""
11+
12+
name = "events"
13+
14+
def start_requests(self):
15+
yield Request(
16+
url="https://example.org",
17+
meta={
18+
"playwright": True,
19+
"playwright_page_coroutines": [
20+
PageCoroutine("evaluate", "alert('foobar');"),
21+
],
22+
"playwright_page_event_handlers": {
23+
"dialog": self.handle_dialog,
24+
"response": "handle_response",
25+
},
26+
},
27+
)
28+
29+
async def handle_dialog(self, dialog: Dialog) -> None:
30+
self.logger.info(f"Handled dialog with message: {dialog.message}")
31+
await dialog.dismiss()
32+
33+
async def handle_response(self, response: PlaywrightResponse) -> None:
34+
self.logger.info(f"Received response with URL {response.url}")
35+
36+
def parse(self, response):
37+
return {"url": response.url}
38+
39+
40+
if __name__ == "__main__":
41+
process = CrawlerProcess(
42+
settings={
43+
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
44+
"DOWNLOAD_HANDLERS": {
45+
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
46+
# "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
47+
},
48+
}
49+
)
50+
process.crawl(EventsSpider)
51+
process.start()

scrapy_playwright/handler.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,21 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
131131
page = request.meta.get("playwright_page")
132132
if not isinstance(page, Page):
133133
page = await self._create_page(request)
134+
135+
# attach event handlers
136+
event_handlers = request.meta.get("playwright_page_event_handlers") or {}
137+
for event, handler in event_handlers.items():
138+
if callable(handler):
139+
page.on(event, handler)
140+
elif isinstance(handler, str):
141+
try:
142+
page.on(event, getattr(spider, handler))
143+
except AttributeError:
144+
logger.warning(
145+
f"Spider '{spider.name}' does not have a '{handler}' attribute,"
146+
f" ignoring handler for event '{event}'"
147+
)
148+
134149
await page.unroute("**")
135150
await page.route(
136151
"**",

tests/test_playwright_requests.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
import logging
12
import platform
23
import subprocess
34
from tempfile import NamedTemporaryFile
45

56
import pytest
6-
from playwright.async_api import Page as PlaywrightPage, TimeoutError
7+
from playwright.async_api import Dialog, Page as PlaywrightPage, TimeoutError
78
from scrapy import Spider, Request, FormRequest
89
from scrapy.http.response.html import HtmlResponse
910
from scrapy.utils.test import get_crawler
@@ -22,6 +23,16 @@ def get_mimetype(file):
2223
).stdout.strip()
2324

2425

26+
class DialogSpider(Spider):
27+
"""A spider with a method to handle the "dialog" page event"""
28+
29+
name = "dialog"
30+
31+
async def handle_dialog(self, dialog: Dialog) -> None:
32+
self.dialog_message = dialog.message
33+
await dialog.dismiss()
34+
35+
2536
class MixinTestCase:
2637
@pytest.mark.asyncio
2738
async def test_basic_response(self):
@@ -238,6 +249,87 @@ async def test_page_coroutine_pdf(self):
238249

239250
await handler.browser.close()
240251

252+
@pytest.mark.asyncio
253+
async def test_event_handler_dialog_callable(self):
254+
crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
255+
handler = ScrapyPlaywrightDownloadHandler(crawler)
256+
await handler._launch_browser()
257+
258+
with StaticMockServer() as server:
259+
spider = DialogSpider()
260+
req = Request(
261+
url=server.urljoin("/index.html"),
262+
meta={
263+
"playwright": True,
264+
"playwright_page_coroutines": [
265+
PageCoro("evaluate", "alert('foobar');"),
266+
],
267+
"playwright_page_event_handlers": {
268+
"dialog": spider.handle_dialog,
269+
},
270+
},
271+
)
272+
await handler._download_request(req, spider)
273+
274+
assert spider.dialog_message == "foobar"
275+
276+
await handler.browser.close()
277+
278+
@pytest.mark.asyncio
279+
async def test_event_handler_dialog_str(self):
280+
crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
281+
handler = ScrapyPlaywrightDownloadHandler(crawler)
282+
await handler._launch_browser()
283+
284+
with StaticMockServer() as server:
285+
spider = DialogSpider()
286+
req = Request(
287+
url=server.urljoin("/index.html"),
288+
meta={
289+
"playwright": True,
290+
"playwright_page_coroutines": [
291+
PageCoro("evaluate", "alert('foobar');"),
292+
],
293+
"playwright_page_event_handlers": {
294+
"dialog": "handle_dialog",
295+
},
296+
},
297+
)
298+
await handler._download_request(req, spider)
299+
300+
assert spider.dialog_message == "foobar"
301+
302+
await handler.browser.close()
303+
304+
@pytest.mark.asyncio
305+
async def test_event_handler_dialog_missing(self, caplog):
306+
crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
307+
handler = ScrapyPlaywrightDownloadHandler(crawler)
308+
await handler._launch_browser()
309+
310+
with StaticMockServer() as server:
311+
spider = DialogSpider()
312+
req = Request(
313+
url=server.urljoin("/index.html"),
314+
meta={
315+
"playwright": True,
316+
"playwright_page_event_handlers": {
317+
"dialog": "missing_method",
318+
},
319+
},
320+
)
321+
await handler._download_request(req, spider)
322+
323+
assert (
324+
"scrapy-playwright",
325+
logging.WARNING,
326+
"Spider 'dialog' does not have a 'missing_method' attribute,"
327+
" ignoring handler for event 'dialog'",
328+
) in caplog.record_tuples
329+
assert getattr(spider, "dialog_message", None) is None
330+
331+
await handler.browser.close()
332+
241333

242334
class TestCaseChromium(MixinTestCase):
243335
browser_type = "chromium"

tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ deps =
1010
pytest-cov>=2.8
1111
pytest-twisted>=1.11
1212
commands =
13+
playwright install
1314
py.test --reactor=asyncio \
1415
--cov-report=term-missing --cov-report=html --cov-report=xml \
1516
--cov=scrapy_playwright {posargs: scrapy_playwright tests}

0 commit comments

Comments
 (0)