Skip to content

Commit bb68f3d

Browse files
committed
Mailgun: fix event/metadata param extraction in tracking webhook
Mailgun merges user-variables (metadata) into the webhook post data interspersed with the actual event params. This can lead to ambiguity interpreting post data. To extract metadata from an event, Anymail had been attempting to avoid that ambiguity by instead using X-Mailgun-Variables fields found in the event's message-headers param. But message-headers isn't included in some tracking events (opened, clicked, unsubscribed), resulting in empty metadata for those events. (#76) Also, conflicting metadata keys could confuse Anymail's Mailgun event parsing, leading to unexpected values in the normalized event. (#77) This commit: * Cleans up Anymail's tracking webhook to be explicit about which multi-value params it uses, avoiding conflicts with metadata keys. Fixes #77. * Extracts metadata from post params for opened, clicked and unsubscribed events. All unknown event params are assumed to be metadata. Fixes #76. * Documents a few metadata key names where it's impossible (or likely to be unreliable) for Anymail to extract metadata from the post data. For reference, the order of params in the Mailgun's post data *appears* to be (from live testing): * For the timestamp, token and signature params, any user-variable with the same name appears *before* the corresponding event data. * For all other params, any user-variable with the same name as a Mailgun event param appears *after* the Mailgun data.
1 parent 636c8a5 commit bb68f3d

File tree

5 files changed

+208
-34
lines changed

5 files changed

+208
-34
lines changed

anymail/utils.py

+28
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,34 @@ def collect_all_methods(cls, method_name):
398398
return methods
399399

400400

401+
def querydict_getfirst(qdict, field, default=UNSET):
402+
"""Like :func:`django.http.QueryDict.get`, but returns *first* value of multi-valued field.
403+
404+
>>> from django.http import QueryDict
405+
>>> q = QueryDict('a=1&a=2&a=3')
406+
>>> querydict_getfirst(q, 'a')
407+
'1'
408+
>>> q.get('a')
409+
'3'
410+
>>> q['a']
411+
'3'
412+
413+
You can bind this to a QueryDict instance using the "descriptor protocol":
414+
>>> q.getfirst = querydict_getfirst.__get__(q)
415+
>>> q.getfirst('a')
416+
'1'
417+
"""
418+
# (Why not instead define a QueryDict subclass with this method? Because there's no simple way
419+
# to efficiently initialize a QueryDict subclass with the contents of an existing instance.)
420+
values = qdict.getlist(field)
421+
if len(values) > 0:
422+
return values[0]
423+
elif default is not UNSET:
424+
return default
425+
else:
426+
return qdict[field] # raise appropriate KeyError
427+
428+
401429
EPOCH = datetime(1970, 1, 1, tzinfo=utc)
402430

403431

anymail/webhooks/mailgun.py

+84-31
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from .base import AnymailBaseWebhookView
1010
from ..exceptions import AnymailWebhookValidationFailure
1111
from ..signals import tracking, AnymailTrackingEvent, EventType, RejectReason
12-
from ..utils import get_anymail_setting, combine
12+
from ..utils import get_anymail_setting, combine, querydict_getfirst
1313

1414

1515
class MailgunBaseWebhookView(AnymailBaseWebhookView):
@@ -28,6 +28,8 @@ def __init__(self, **kwargs):
2828
def validate_request(self, request):
2929
super(MailgunBaseWebhookView, self).validate_request(request) # first check basic auth if enabled
3030
try:
31+
# Must use the *last* value of these fields if there are conflicting merged user-variables.
32+
# (Fortunately, Django QueryDict is specced to return the last value.)
3133
token = request.POST['token']
3234
timestamp = request.POST['timestamp']
3335
signature = str(request.POST['signature']) # force to same type as hexdigest() (for python2)
@@ -75,27 +77,31 @@ class MailgunTrackingWebhookView(MailgunBaseWebhookView):
7577

7678
def esp_to_anymail_event(self, esp_event):
7779
# esp_event is a Django QueryDict (from request.POST),
78-
# which has multi-valued fields, but is *not* case-insensitive
79-
80-
event_type = self.event_types.get(esp_event['event'], EventType.UNKNOWN)
81-
timestamp = datetime.fromtimestamp(int(esp_event['timestamp']), tz=utc)
80+
# which has multi-valued fields, but is *not* case-insensitive.
81+
# Because of the way Mailgun merges user-variables into the event,
82+
# we must generally use the *first* value of any multi-valued field
83+
# to avoid potential conflicting user-data.
84+
esp_event.getfirst = querydict_getfirst.__get__(esp_event)
85+
86+
event_type = self.event_types.get(esp_event.getfirst('event'), EventType.UNKNOWN)
87+
timestamp = datetime.fromtimestamp(int(esp_event['timestamp']), tz=utc) # use *last* value of timestamp
8288
# Message-Id is not documented for every event, but seems to always be included.
8389
# (It's sometimes spelled as 'message-id', lowercase, and missing the <angle-brackets>.)
84-
message_id = esp_event.get('Message-Id', esp_event.get('message-id', None))
90+
message_id = esp_event.getfirst('Message-Id', None) or esp_event.getfirst('message-id', None)
8591
if message_id and not message_id.startswith('<'):
8692
message_id = "<{}>".format(message_id)
8793

88-
description = esp_event.get('description', None)
89-
mta_response = esp_event.get('error', esp_event.get('notification', None))
94+
description = esp_event.getfirst('description', None)
95+
mta_response = esp_event.getfirst('error', None) or esp_event.getfirst('notification', None)
9096
reject_reason = None
9197
try:
92-
mta_status = int(esp_event['code'])
98+
mta_status = int(esp_event.getfirst('code'))
9399
except (KeyError, TypeError):
94100
pass
95101
except ValueError:
96102
# RFC-3463 extended SMTP status code (class.subject.detail, where class is "2", "4" or "5")
97103
try:
98-
status_class = esp_event['code'].split('.')[0]
104+
status_class = esp_event.getfirst('code').split('.')[0]
99105
except (TypeError, IndexError):
100106
# illegal SMTP status code format
101107
pass
@@ -107,37 +113,84 @@ def esp_to_anymail_event(self, esp_event):
107113
RejectReason.BOUNCED if 400 <= mta_status < 600
108114
else RejectReason.OTHER)
109115

110-
# Mailgun merges metadata fields with the other event fields.
111-
# However, it also includes the original message headers,
112-
# which have the metadata separately as X-Mailgun-Variables.
113-
try:
114-
headers = json.loads(esp_event['message-headers'])
115-
except (KeyError, ):
116-
metadata = {}
117-
else:
118-
variables = [value for [field, value] in headers
119-
if field == 'X-Mailgun-Variables']
120-
if len(variables) >= 1:
121-
# Each X-Mailgun-Variables value is JSON. Parse and merge them all into single dict:
122-
metadata = combine(*[json.loads(value) for value in variables])
123-
else:
124-
metadata = {}
116+
metadata = self._extract_metadata(esp_event)
125117

126-
# tags are sometimes delivered as X-Mailgun-Tag fields, sometimes as tag
127-
tags = esp_event.getlist('tag', esp_event.getlist('X-Mailgun-Tag', []))
118+
# tags are supposed to be in 'tag' fields, but are sometimes in undocumented X-Mailgun-Tag
119+
tags = esp_event.getlist('tag', None) or esp_event.getlist('X-Mailgun-Tag', [])
128120

129121
return AnymailTrackingEvent(
130122
event_type=event_type,
131123
timestamp=timestamp,
132124
message_id=message_id,
133-
event_id=esp_event.get('token', None),
134-
recipient=esp_event.get('recipient', None),
125+
event_id=esp_event.get('token', None), # use *last* value of token
126+
recipient=esp_event.getfirst('recipient', None),
135127
reject_reason=reject_reason,
136128
description=description,
137129
mta_response=mta_response,
138130
tags=tags,
139131
metadata=metadata,
140-
click_url=esp_event.get('url', None),
141-
user_agent=esp_event.get('user-agent', None),
132+
click_url=esp_event.getfirst('url', None),
133+
user_agent=esp_event.getfirst('user-agent', None),
142134
esp_event=esp_event,
143135
)
136+
137+
def _extract_metadata(self, esp_event):
138+
# Mailgun merges user-variables into the POST fields. If you know which user variable
139+
# you want to retrieve--and it doesn't conflict with a Mailgun event field--that's fine.
140+
# But if you want to extract all user-variables (like we do), it's more complicated...
141+
event_type = esp_event.getfirst('event')
142+
metadata = {}
143+
144+
if 'message-headers' in esp_event:
145+
# For events where original message headers are available, it's most reliable
146+
# to recover user-variables from the X-Mailgun-Variables header(s).
147+
headers = json.loads(esp_event['message-headers'])
148+
variables = [value for [field, value] in headers if field == 'X-Mailgun-Variables']
149+
if len(variables) >= 1:
150+
# Each X-Mailgun-Variables value is JSON. Parse and merge them all into single dict:
151+
metadata = combine(*[json.loads(value) for value in variables])
152+
153+
elif event_type in self._known_event_fields:
154+
# For other events, we must extract from the POST fields, ignoring known Mailgun
155+
# event parameters, and treating all other values as user-variables.
156+
known_fields = self._known_event_fields[event_type]
157+
for field, values in esp_event.lists():
158+
if field not in known_fields:
159+
# Unknown fields are assumed to be user-variables. (There should really only be
160+
# a single value, but just in case take the last one to match QueryDict semantics.)
161+
metadata[field] = values[-1]
162+
elif field == 'tag':
163+
# There's no way to distinguish a user-variable named 'tag' from an actual tag,
164+
# so don't treat this/these value(s) as metadata.
165+
pass
166+
elif len(values) == 1:
167+
# This is an expected event parameter, and since there's only a single value
168+
# it must be the event param, not metadata.
169+
pass
170+
else:
171+
# This is an expected event parameter, but there are (at least) two values.
172+
# One is the event param, and the other is a user-variable metadata value.
173+
# Which is which depends on the field:
174+
if field in {'signature', 'timestamp', 'token'}:
175+
metadata[field] = values[0] # values = [user-variable, event-param]
176+
else:
177+
metadata[field] = values[-1] # values = [event-param, user-variable]
178+
179+
return metadata
180+
181+
_common_event_fields = {
182+
# These fields are documented to appear in all Mailgun opened, clicked and unsubscribed events:
183+
'event', 'recipient', 'domain', 'ip', 'country', 'region', 'city', 'user-agent', 'device-type',
184+
'client-type', 'client-name', 'client-os', 'campaign-id', 'campaign-name', 'tag', 'mailing-list',
185+
'timestamp', 'token', 'signature',
186+
# Undocumented, but observed in actual events:
187+
'body-plain', 'h', 'message-id',
188+
}
189+
_known_event_fields = {
190+
# For all Mailgun event types that *don't* include message-headers,
191+
# map Mailgun (not normalized) event type to set of expected event fields.
192+
# Used for metadata extraction.
193+
'clicked': _common_event_fields | {'url'},
194+
'opened': _common_event_fields,
195+
'unsubscribed': _common_event_fields,
196+
}

docs/esps/mailgun.rst

+14
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,20 @@ values directly to Mailgun. You can use any of the (non-file) parameters listed
126126
.. _Mailgun sending docs: https://documentation.mailgun.com/api-sending.html#sending
127127

128128

129+
.. _mailgun-quirks:
130+
131+
Limitations and quirks
132+
----------------------
133+
134+
**Metadata keys and tracking webhooks**
135+
Because of the way Mailgun supplies custom data (user-variables) to webhooks,
136+
there are a few metadata keys that Anymail cannot reliably retrieve in some
137+
tracking events. You should avoid using "body-plain", "h", "message-headers",
138+
"message-id" or "tag" as :attr:`~anymail.message.AnymailMessage.metadata` keys
139+
if you need to access that metadata from an opened, clicked, or unsubscribed
140+
:ref:`tracking event <event-tracking>` handler.
141+
142+
129143
.. _mailgun-templates:
130144

131145
Batch sending/merge and ESP templates

tests/test_mailgun_webhooks.py

+65-2
Original file line numberDiff line numberDiff line change
@@ -225,21 +225,84 @@ def test_alt_smtp_code(self):
225225
self.assertEqual(event.reject_reason, "bounced")
226226
self.assertIn("RecipNotFound", event.mta_response)
227227

228-
def test_metadata(self):
228+
def test_metadata_message_headers(self):
229229
# Metadata fields are interspersed with other data, but also in message-headers
230+
# for delivered, bounced and dropped events
230231
raw_event = mailgun_sign({
231232
'event': 'delivered',
232233
'message-headers': json.dumps([
233234
["X-Mailgun-Variables", "{\"custom1\": \"value1\", \"custom2\": \"{\\\"key\\\":\\\"value\\\"}\"}"],
234235
]),
235-
'custom1': 'value',
236+
'custom1': 'value1',
236237
'custom2': '{"key":"value"}', # you can store JSON, but you'll need to unpack it yourself
237238
})
238239
self.client.post('/anymail/mailgun/tracking/', data=raw_event)
239240
kwargs = self.assert_handler_called_once_with(self.tracking_handler)
240241
event = kwargs['event']
241242
self.assertEqual(event.metadata, {"custom1": "value1", "custom2": '{"key":"value"}'})
242243

244+
def test_metadata_post_fields(self):
245+
# Metadata fields are only interspersed with other event params
246+
# for opened, clicked, unsubscribed events
247+
raw_event = mailgun_sign({
248+
'event': 'clicked',
249+
'custom1': 'value1',
250+
'custom2': '{"key":"value"}', # you can store JSON, but you'll need to unpack it yourself
251+
})
252+
self.client.post('/anymail/mailgun/tracking/', data=raw_event)
253+
kwargs = self.assert_handler_called_once_with(self.tracking_handler)
254+
event = kwargs['event']
255+
self.assertEqual(event.metadata, {"custom1": "value1", "custom2": '{"key":"value"}'})
256+
257+
def test_metadata_key_conflicts(self):
258+
# If you happen to name metadata (user-variable) keys the same as Mailgun
259+
# event properties, Mailgun will include both in the webhook post.
260+
# Make sure we don't confuse them.
261+
metadata = {
262+
"event": "metadata-event",
263+
"recipient": "metadata-recipient",
264+
"signature": "metadata-signature",
265+
"timestamp": "metadata-timestamp",
266+
"token": "metadata-token",
267+
"ordinary field": "ordinary metadata value",
268+
}
269+
270+
raw_event = mailgun_sign({
271+
'event': 'clicked',
272+
'recipient': '[email protected]',
273+
'token': 'actual-event-token',
274+
'timestamp': '1461261330',
275+
'url': 'http://clicked.example.com/actual/event/param',
276+
'h': "an (undocumented) Mailgun event param",
277+
'tag': ["actual-tag-1", "actual-tag-2"],
278+
})
279+
280+
# Simulate how Mailgun merges user-variables fields into event:
281+
for key in metadata.keys():
282+
if key in raw_event:
283+
if key in {'signature', 'timestamp', 'token'}:
284+
# For these fields, Mailgun's value appears after the metadata value
285+
raw_event[key] = [metadata[key], raw_event[key]]
286+
elif key == 'message-headers':
287+
pass # Mailgun won't merge this field into the event
288+
else:
289+
# For all other fields, the defined event value comes first
290+
raw_event[key] = [raw_event[key], metadata[key]]
291+
else:
292+
raw_event[key] = metadata[key]
293+
294+
response = self.client.post('/anymail/mailgun/tracking/', data=raw_event)
295+
self.assertEqual(response.status_code, 200) # if this fails, signature checking is using metadata values
296+
297+
kwargs = self.assert_handler_called_once_with(self.tracking_handler)
298+
event = kwargs['event']
299+
self.assertEqual(event.event_type, "clicked")
300+
self.assertEqual(event.recipient, "[email protected]")
301+
self.assertEqual(event.timestamp.isoformat(), "2016-04-21T17:55:30+00:00")
302+
self.assertEqual(event.event_id, "actual-event-token")
303+
self.assertEqual(event.tags, ["actual-tag-1", "actual-tag-2"])
304+
self.assertEqual(event.metadata, metadata)
305+
243306
def test_tags(self):
244307
# Most events include multiple 'tag' fields for message's tags
245308
raw_event = mailgun_sign({

tests/test_utils.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from unittest import skipIf
55

66
import six
7+
from django.http import QueryDict
78
from django.test import SimpleTestCase, RequestFactory, override_settings
89
from django.utils.translation import ugettext_lazy
910

@@ -22,7 +23,7 @@
2223
parse_address_list, EmailAddress,
2324
is_lazy, force_non_lazy, force_non_lazy_dict, force_non_lazy_list,
2425
update_deep,
25-
get_request_uri, get_request_basic_auth, parse_rfc2822date)
26+
get_request_uri, get_request_basic_auth, parse_rfc2822date, querydict_getfirst)
2627

2728

2829
class ParseAddressListTests(SimpleTestCase):
@@ -299,6 +300,21 @@ def test_get_request_uri_with_proxy(self):
299300
"https://user:[email protected]:8989/path/to/?query")
300301

301302

303+
class QueryDictUtilsTests(SimpleTestCase):
304+
def test_querydict_getfirst(self):
305+
q = QueryDict("a=one&a=two&a=three")
306+
q.getfirst = querydict_getfirst.__get__(q)
307+
self.assertEqual(q.getfirst('a'), "one")
308+
309+
# missing key exception:
310+
with self.assertRaisesMessage(KeyError, "not a key"):
311+
q.getfirst("not a key")
312+
313+
# defaults:
314+
self.assertEqual(q.getfirst('not a key', "beta"), "beta")
315+
self.assertIsNone(q.getfirst('not a key', None))
316+
317+
302318
class ParseRFC2822DateTests(SimpleTestCase):
303319
def test_with_timezones(self):
304320
dt = parse_rfc2822date("Tue, 24 Oct 2017 10:11:35 -0700")

0 commit comments

Comments
 (0)