-
-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Server side analytics #4131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Server side analytics #4131
Changes from 7 commits
27cc74b
edc96d3
9f796d4
dd320af
a1f8201
0e60520
30ee811
46ca5d7
2c09fda
e283d32
b425ce9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""App init""" | ||
|
||
# Dotted path to this package's AppConfig subclass (presumably picked up by
# Django's app loading through the ``default_app_config`` convention).
default_app_config = 'readthedocs.analytics.apps.AnalyticsAppConfig'  # noqa
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
"""Django app config for the analytics app.""" | ||
|
||
from __future__ import absolute_import | ||
from django.apps import AppConfig | ||
|
||
|
||
class AnalyticsAppConfig(AppConfig):

    """Application config that registers the analytics package."""

    # Human readable label for the app
    verbose_name = 'Analytics'
    # Full Python path of the package this config belongs to
    name = 'readthedocs.analytics'
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
"""Tasks for Read the Docs' analytics""" | ||
|
||
from __future__ import absolute_import | ||
|
||
from django.conf import settings | ||
|
||
from readthedocs import get_version | ||
from readthedocs.worker import app | ||
|
||
from .utils import send_to_analytics | ||
|
||
|
||
# Baseline parameters shared by every hit posted to Google Analytics.
DEFAULT_PARAMETERS = dict(
    v='1',  # analytics version (always 1)
    aip='1',  # anonymize IP
    tid=settings.GLOBAL_ANALYTICS_CODE,

    # User data
    uip=None,  # User IP address
    ua=None,  # User agent

    # Application info
    an='Read the Docs',
    av=get_version(),  # App version
)
|
||
|
||
@app.task(queue='web')
def analytics_pageview(pageview_data):
    """
    Report a single pageview hit to Google Analytics

    The payload starts from the pageview skeleton, is overlaid with
    ``DEFAULT_PARAMETERS`` and finally with ``pageview_data``, so values
    supplied by the caller win on conflicting keys.

    :see: https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters
    :param pageview_data: pageview parameters to send to GA
    """
    payload = {
        't': 'pageview',
        'dl': None,  # URL of the pageview (required)
        'dt': None,  # Title of the page
    }
    # Later layers override earlier ones
    for layer in (DEFAULT_PARAMETERS, pageview_data):
        payload.update(layer)
    send_to_analytics(payload)
|
||
|
||
@app.task(queue='web')
def analytics_event(event_data):
    """
    Report a single event hit to Google Analytics

    The payload starts from the event skeleton, is overlaid with
    ``DEFAULT_PARAMETERS`` and finally with ``event_data``, so values
    supplied by the caller win on conflicting keys.

    :see: https://developers.google.com/analytics/devguides/collection/protocol/v1/devguide#event
    :param event_data: event parameters to send to GA
    """
    payload = {
        't': 'event',  # GA event - don't change
        'ec': None,  # Event category (required)
        'ea': None,  # Event action (required)
        'el': None,  # Event label
        'ev': None,  # Event value (numeric)
    }
    # Later layers override earlier ones
    for layer in (DEFAULT_PARAMETERS, event_data):
        payload.update(layer)
    send_to_analytics(payload)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from __future__ import absolute_import, unicode_literals | ||
|
||
from django.test import TestCase | ||
|
||
from .utils import anonymize_ip_address, anonymize_user_agent | ||
|
||
|
||
class UtilsTests(TestCase):

    """Checks for the IP address and user agent anonymization helpers."""

    def test_anonymize_ip(self):
        # Pairs of (raw address, address with its last 2 bytes zeroed)
        expectations = [
            ('127.0.0.1', '127.0.0.0'),
            ('127.127.127.127', '127.127.0.0'),
            (
                '3ffe:1900:4545:3:200:f8ff:fe21:67cf',
                '3ffe:1900:4545:3:200:f8ff:fe21:0',
            ),
            ('fe80::200:f8ff:fe21:67cf', 'fe80::200:f8ff:fe21:0'),
        ]
        for raw_ip, anonymized_ip in expectations:
            self.assertEqual(anonymize_ip_address(raw_ip), anonymized_ip)

    def test_anonymize_ua(self):
        # A recognized browser/OS pair is passed through untouched
        common_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'  # noqa
        self.assertEqual(anonymize_user_agent(common_ua), common_ua)

        # An unrecognized user agent collapses to the generic label
        rare_ua = 'Some rare user agent'
        self.assertEqual(anonymize_user_agent(rare_ua), 'Rare user agent')
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
"""Utilities related to analytics""" | ||
|
||
from __future__ import absolute_import, unicode_literals | ||
import hashlib | ||
import logging | ||
|
||
from django.conf import settings | ||
from django.utils.encoding import force_text, force_bytes | ||
from django.utils.crypto import get_random_string | ||
import requests | ||
from user_agents import parse | ||
|
||
try: | ||
# Python 3.3+ only | ||
import ipaddress | ||
except ImportError: | ||
from .vendor import ipaddress | ||
|
||
log = logging.getLogger(__name__) # noqa | ||
|
||
# Bitmask applied to an IP address integer to anonymize it: the low 16 bits
# (the last 2 bytes of the address) are cleared, higher bits are kept.
MASK = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this just live in the |
||
|
||
|
||
def get_client_ip(request):
    """
    Get the real client IP based on a request object

    The first entry of ``X-Forwarded-For`` (the originating client) takes
    precedence over ``REMOTE_ADDR`` when the header is present and not blank.

    NOTE: ``X-Forwarded-For`` is supplied by the client and any proxies in
    between, so the returned value is only as trustworthy as they are.

    :param request: object exposing a ``META`` mapping of request headers
    :returns: the client IP as found in the headers, or ``None`` when
        neither header is set
    """
    # Get the original IP address (eg. "X-Forwarded-For: client, proxy1, proxy2")
    forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR') or ''

    # Entries may be padded with whitespace around the commas. Strip it so the
    # value can be parsed as an IP and a blank header falls back to REMOTE_ADDR
    client_ip = forwarded_for.split(',')[0].strip()
    if client_ip:
        return client_ip

    return request.META.get('REMOTE_ADDR')
|
||
|
||
def anonymize_ip_address(ip_address):
    """
    Anonymize an IP address by zeroing the last 2 bytes

    :param ip_address: textual IPv4 or IPv6 address
    :returns: compressed string form of the anonymized address, or ``None``
        when ``ip_address`` cannot be parsed as an IP address
    """
    try:
        ip_obj = ipaddress.ip_address(force_text(ip_address))
    except ValueError:
        return None

    # Rebuild with the parsed address's own class: ``ipaddress.ip_address``
    # treats any integer below 2**32 as IPv4, which would turn a small IPv6
    # address such as ``::1`` into ``0.0.0.0`` instead of ``::``
    anonymized_ip = type(ip_obj)(int(ip_obj) & MASK)
    return anonymized_ip.compressed
|
||
|
||
def anonymize_user_agent(user_agent):
    """
    Replace rare user agents with a generic label

    A user agent counts as rare when the parser reports either its browser
    family or its OS family as ``'Other'``; any other user agent is returned
    unchanged.
    """
    details = parse(user_agent)
    is_rare = 'Other' in (details.browser.family, details.os.family)
    return 'Rare user agent' if is_rare else user_agent
|
||
|
||
def send_to_analytics(data):
    """
    Sends data to Google Analytics

    ``data`` is modified in place: a ``uid`` is added when both the IP and
    the user agent are known, then ``uip`` and ``ua`` are replaced by their
    anonymized forms before the hit is posted.

    Timeouts and non-OK responses are logged as warnings, not raised.

    :param data: dict of parameters to post to Google Analytics
    """
    if data.get('uip') and data.get('ua'):
        # Derive the ID from the raw values, before they are anonymized below
        data['uid'] = generate_client_id(data['uip'], data['ua'])

    if 'uip' in data:
        # Anonymize IP address if applicable
        data['uip'] = anonymize_ip_address(data['uip'])

    if data.get('ua') is not None:
        # Anonymize user agent if it is rare; an unset (``None``) user agent
        # is left as-is rather than handed to the user agent parser
        data['ua'] = anonymize_user_agent(data['ua'])

    resp = None
    try:
        resp = requests.post(
            'https://www.google-analytics.com/collect',
            data=data,
            # requests waits indefinitely unless a timeout is given; keep it
            # low so a slow analytics endpoint cannot tie up the worker
            timeout=3,  # seconds
        )
    except requests.Timeout:
        log.warning('Timeout sending to Google Analytics')

    # ``Response`` objects are falsy for 4xx/5xx statuses, so compare against
    # ``None`` explicitly; a plain truth test would never report a failure
    if resp is not None and not resp.ok:
        log.warning('Unknown error sending to Google Analytics')
|
||
|
||
def generate_client_id(ip_address, user_agent):
    """
    Build a hashed client identifier for analytics

    The ID is a SHA-256 hex digest over the secret key, a fixed salt and
    whichever of ``ip_address``/``user_agent`` are set. Requests sharing the
    same IP and the same UA therefore map to the same ID and are treated as
    one user for analytics purposes.
    """
    digest = hashlib.sha256(force_bytes(settings.SECRET_KEY))
    digest.update(b'advertising-client-id')  # salt

    identifying_parts = [part for part in (ip_address, user_agent) if part]
    if not identifying_parts:
        # With neither an IP nor a UA there is nothing to tell sessions
        # apart, so mix in random data and treat every user differently
        identifying_parts.append(get_random_string())

    for part in identifying_parts:
        digest.update(force_bytes(part))

    return digest.hexdigest()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we be setting these to None by default? It seems like we should be a bit more defensive here, and make sure the incoming
event_data
contains them.