Skip to content

Server side analytics #4131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 7, 2018
3 changes: 3 additions & 0 deletions readthedocs/analytics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""App init"""

# Dotted path to the AppConfig subclass declared in ``apps.py`` for this app
default_app_config = 'readthedocs.analytics.apps.AnalyticsAppConfig' # noqa
12 changes: 12 additions & 0 deletions readthedocs/analytics/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Django app config for the analytics app."""

from __future__ import absolute_import
from django.apps import AppConfig


class AnalyticsAppConfig(AppConfig):

    """Django application configuration for the analytics app."""

    # Human-readable label for the app
    verbose_name = 'Analytics'
    # Full dotted Python path of the application package
    name = 'readthedocs.analytics'
63 changes: 63 additions & 0 deletions readthedocs/analytics/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Tasks for Read the Docs' analytics"""

from __future__ import absolute_import

from django.conf import settings

from readthedocs import get_version
from readthedocs.worker import app

from .utils import send_to_analytics


DEFAULT_PARAMETERS = {
    # Protocol-level settings shared by every hit
    'v': '1',  # analytics version (always 1)
    'aip': '1',  # anonymize IP
    'tid': settings.GLOBAL_ANALYTICS_CODE,

    # Per-user placeholders; the tasks overlay caller-supplied values on top
    'uip': None,  # User IP address
    'ua': None,  # User agent

    # Identification of the sending application
    'an': 'Read the Docs',
    'av': get_version(),  # App version
}


@app.task(queue='web')
def analytics_pageview(pageview_data):
    """
    Report a single pageview hit to Google Analytics

    The payload is built in layers: the pageview-specific keys first, then
    ``DEFAULT_PARAMETERS``, then ``pageview_data`` -- so values supplied by
    the caller win over both sets of defaults.

    :see: https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters
    :param pageview_data: pageview parameters to send to GA
    """
    payload = {
        't': 'pageview',
        'dl': None,  # URL of the pageview (required)
        'dt': None,  # Title of the page
    }
    for layer in (DEFAULT_PARAMETERS, pageview_data):
        payload.update(layer)
    send_to_analytics(payload)


@app.task(queue='web')
def analytics_event(event_data):
    """
    Report a single event hit to Google Analytics

    The payload is built in layers: the event-specific keys first, then
    ``DEFAULT_PARAMETERS``, then ``event_data`` -- so values supplied by the
    caller win over both sets of defaults.

    :see: https://developers.google.com/analytics/devguides/collection/protocol/v1/devguide#event
    :param event_data: event parameters to send to GA
    """
    # NOTE(review): a reviewer asked whether these keys should default to
    # None at all, suggesting we be more defensive and make sure the incoming
    # event_data actually contains them.
    payload = {
        't': 'event',  # GA event - don't change
        'ec': None,  # Event category (required)
        'ea': None,  # Event action (required)
        'el': None,  # Event label
        'ev': None,  # Event value (numeric)
    }
    for layer in (DEFAULT_PARAMETERS, event_data):
        payload.update(layer)
    send_to_analytics(payload)
32 changes: 32 additions & 0 deletions readthedocs/analytics/tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from __future__ import absolute_import, unicode_literals

from django.test import TestCase

from .utils import anonymize_ip_address, anonymize_user_agent


class UtilsTests(TestCase):

    """Checks for the anonymization helpers in ``utils``"""

    def test_anonymize_ip(self):
        # (input address, expected anonymized form) for IPv4 and IPv6
        expectations = (
            ('127.0.0.1', '127.0.0.0'),
            ('127.127.127.127', '127.127.0.0'),
            (
                '3ffe:1900:4545:3:200:f8ff:fe21:67cf',
                '3ffe:1900:4545:3:200:f8ff:fe21:0',
            ),
            ('fe80::200:f8ff:fe21:67cf', 'fe80::200:f8ff:fe21:0'),
        )
        for raw_address, expected in expectations:
            self.assertEqual(anonymize_ip_address(raw_address), expected)

    def test_anonymize_ua(self):
        # A recognized browser/OS combination is passed through untouched
        common_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        self.assertEqual(anonymize_user_agent(common_ua), common_ua)

        # An unrecognized user agent is replaced by a generic label
        unusual_ua = 'Some rare user agent'
        self.assertEqual(anonymize_user_agent(unusual_ua), 'Rare user agent')

107 changes: 107 additions & 0 deletions readthedocs/analytics/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Utilities related to analytics"""

from __future__ import absolute_import, unicode_literals
import hashlib
import logging

from django.conf import settings
from django.utils.encoding import force_text, force_bytes
from django.utils.crypto import get_random_string
import requests
from user_agents import parse

try:
# Python 3.3+ only
import ipaddress
except ImportError:
from .vendor import ipaddress

log = logging.getLogger(__name__) # noqa

# Bit mask used to anonymize an IP: AND-ing an address's integer value with
# it zeroes out the last 2 bytes and leaves the higher bits untouched
MASK = int('0xFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000', 16)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this just live in the anonymize_ip_address function? Not sure if we need it elsewhere.



def get_client_ip(request):
    """
    Gets the real IP based on a request object

    The first entry of the ``X-Forwarded-For`` header is preferred when it is
    present and non-blank; otherwise ``REMOTE_ADDR`` is used.

    :param request: object exposing a ``META`` mapping of request headers
    :returns: the client IP as a string, or ``None`` if neither value is set
    """
    ip_address = request.META.get('REMOTE_ADDR')

    # Get the original IP address (eg. "X-Forwarded-For: client, proxy1, proxy2")
    forwarded_header = request.META.get('HTTP_X_FORWARDED_FOR', '')
    # Strip optional whitespace around the list separator so that
    # "client , proxy" yields "client" and a blank header is ignored
    original_client = forwarded_header.split(',')[0].strip()
    if original_client:
        ip_address = original_client

    return ip_address


def anonymize_ip_address(ip_address):
    """
    Anonymizes an IP address by zeroing the last 2 bytes

    :param ip_address: IPv4 or IPv6 address in textual form
    :returns: the compressed textual form of the anonymized address,
        or ``None`` if ``ip_address`` cannot be parsed as an IP address
    """
    try:
        ip_obj = ipaddress.ip_address(force_text(ip_address))
    except ValueError:
        return None

    # Rebuild with the class of the parsed address. Going back through
    # ``ipaddress.ip_address()`` with a bare integer would treat any value
    # below 2**32 as IPv4, so an IPv6 address such as "::1" would be
    # reported as "0.0.0.0" instead of "::".
    anonymized_ip = type(ip_obj)(int(ip_obj) & MASK)
    return anonymized_ip.compressed


def anonymize_user_agent(user_agent):
    """
    Replaces rare user agents with a generic label

    A user agent counts as rare when either its browser family or its OS
    family is reported as ``'Other'`` by the parser. Any other user agent is
    returned unchanged.
    """
    details = parse(user_agent)
    families = (details.browser.family, details.os.family)
    if 'Other' in families:
        return 'Rare user agent'

    return user_agent


def send_to_analytics(data):
    """
    Sends data to Google Analytics

    ``data`` is modified in place before it is posted: a ``uid`` is derived
    when both ``uip`` and ``ua`` are set, then ``uip`` and ``ua`` are replaced
    by their anonymized forms.

    Delivery is best effort: a timeout or an unsuccessful response is logged
    as a warning and nothing is returned.

    :param data: dict of parameters to post to the GA collect endpoint
    """
    if data.get('uip') and data.get('ua'):
        # Derive the ID from the raw values, before they are anonymized below
        data['uid'] = generate_client_id(data['uip'], data['ua'])

    if 'uip' in data:
        # Anonymize IP address if applicable
        data['uip'] = anonymize_ip_address(data['uip'])

    if data.get('ua') is not None:
        # Anonymize user agent if it is rare. A ``None`` placeholder is left
        # alone: there is nothing to anonymize and the parser expects a string.
        data['ua'] = anonymize_user_agent(data['ua'])

    resp = None
    try:
        resp = requests.post(
            'https://www.google-analytics.com/collect',
            data=data,
            # requests applies no timeout unless one is given; keep it low so
            # a slow analytics endpoint cannot tie up the worker
            timeout=3,  # seconds
        )
    except requests.Timeout:
        log.warning('Timeout sending to Google Analytics')

    # Compare against None explicitly: a Response is falsy whenever it is not
    # ok, so ``if resp and not resp.ok`` could never be true
    if resp is not None and not resp.ok:
        log.warning('Unknown error sending to Google Analytics')


def generate_client_id(ip_address, user_agent):
    """
    Create an advertising ID

    The ID is a SHA-256 hex digest over the secret key, a fixed salt and
    whichever of ``ip_address`` / ``user_agent`` are set. Two requests with
    the same IP and the same UA therefore get the same ID and are treated as
    the same user for analytics purposes.
    """
    digest = hashlib.sha256(force_bytes(settings.SECRET_KEY))
    digest.update(b'advertising-client-id')

    identifying_parts = [part for part in (ip_address, user_agent) if part]
    for part in identifying_parts:
        digest.update(force_bytes(part))

    if not identifying_parts:
        # With neither an IP nor a UA there is nothing to tell sessions
        # apart, so mix in a random value and treat every user differently
        digest.update(force_bytes(get_random_string()))

    return digest.hexdigest()
Empty file.
Loading