Skip to content

Server side analytics #4131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 7, 2018
3 changes: 3 additions & 0 deletions readthedocs/analytics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""App init"""

# Dotted path to the AppConfig subclass for this app (AnalyticsAppConfig in
# the sibling ``apps`` module), so Django uses it when the app is loaded.
default_app_config = 'readthedocs.analytics.apps.AnalyticsAppConfig' # noqa
12 changes: 12 additions & 0 deletions readthedocs/analytics/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Django app config for the analytics app."""

from __future__ import absolute_import
from django.apps import AppConfig


class AnalyticsAppConfig(AppConfig):

    """Django application configuration for the analytics app."""

    # Human-readable label for the app
    verbose_name = 'Analytics'
    # Full Python path of the application package
    name = 'readthedocs.analytics'
69 changes: 69 additions & 0 deletions readthedocs/analytics/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Tasks for Read the Docs' analytics"""

from __future__ import absolute_import

from django.conf import settings

from readthedocs import get_version
from readthedocs.worker import app

from .utils import send_to_analytics


# Parameters merged into every hit sent to Google Analytics. Individual tasks
# may override any of them through their keyword arguments.
DEFAULT_PARAMETERS = dict(
    v='1',  # analytics version (always 1)
    aip='1',  # anonymize IP
    tid=settings.GLOBAL_ANALYTICS_CODE,  # analytics code from settings

    # User data (empty unless the caller supplies it)
    uip=None,  # User IP address
    ua=None,  # User agent

    # Application info
    an='Read the Docs',
    av=get_version(),  # App version
)


@app.task(queue='web')
def analytics_pageview(url, title=None, **kwargs):
    """
    Report a single pageview hit to Google Analytics

    The hit is assembled from the pageview fields, then the module-wide
    ``DEFAULT_PARAMETERS``, then ``kwargs``; later sources win on key clashes.

    :see: https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters
    :param url: the URL of the pageview
    :param title: the title of the page being viewed
    :param kwargs: extra pageview parameters to send to GA
    """
    payload = {'t': 'pageview'}
    payload['dl'] = url  # URL of the pageview (required)
    payload['dt'] = title  # Title of the page

    # Apply defaults first so caller-supplied parameters can override them
    for overrides in (DEFAULT_PARAMETERS, kwargs):
        payload.update(overrides)

    send_to_analytics(payload)


@app.task(queue='web')
def analytics_event(
        event_category,
        event_action,
        event_label=None,
        event_value=None,
        **kwargs
):
    """
    Report a single event hit to Google Analytics

    The hit is assembled from the event fields, then the module-wide
    ``DEFAULT_PARAMETERS``, then ``kwargs``; later sources win on key clashes.

    :see: https://developers.google.com/analytics/devguides/collection/protocol/v1/devguide#event
    :param event_category: the category of the event
    :param event_action: the action of the event (use action words like "click")
    :param event_label: an optional string to differentiate the event
    :param event_value: an optional numeric value for the event
    :param kwargs: extra event parameters to send to GA
    """
    payload = {'t': 'event'}  # GA event - don't change
    payload['ec'] = event_category  # Event category (required)
    payload['ea'] = event_action  # Event action (required)
    payload['el'] = event_label  # Event label
    payload['ev'] = event_value  # Event value (numeric)

    # Apply defaults first so caller-supplied parameters can override them
    for overrides in (DEFAULT_PARAMETERS, kwargs):
        payload.update(overrides)

    send_to_analytics(payload)
32 changes: 32 additions & 0 deletions readthedocs/analytics/tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from __future__ import absolute_import, unicode_literals

from django.test import TestCase

from .utils import anonymize_ip_address, anonymize_user_agent


class UtilsTests(TestCase):

    """Tests for the anonymization helpers in the analytics utilities."""

    def test_anonymize_ip(self):
        # (input address, expected anonymized address) for IPv4 and IPv6
        expectations = [
            ('127.0.0.1', '127.0.0.0'),
            ('127.127.127.127', '127.127.0.0'),
            (
                '3ffe:1900:4545:3:200:f8ff:fe21:67cf',
                '3ffe:1900:4545:3:200:f8ff:fe21:0',
            ),
            ('fe80::200:f8ff:fe21:67cf', 'fe80::200:f8ff:fe21:0'),
        ]
        for raw_ip, expected in expectations:
            self.assertEqual(anonymize_ip_address(raw_ip), expected)

    def test_anonymize_ua(self):
        # A mainstream browser/OS combination must pass through untouched
        common_ua = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/66.0.3359.181 Safari/537.36'
        )
        self.assertEqual(anonymize_user_agent(common_ua), common_ua)

        # An unrecognized user agent is replaced by a generic marker
        masked = anonymize_user_agent('Some rare user agent')
        self.assertEqual(masked, 'Rare user agent')

109 changes: 109 additions & 0 deletions readthedocs/analytics/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""Utilities related to analytics"""

from __future__ import absolute_import, unicode_literals
import hashlib
import logging

from django.conf import settings
from django.utils.encoding import force_text, force_bytes
from django.utils.crypto import get_random_string
import requests
from user_agents import parse

try:
# Python 3.3+ only
import ipaddress
except ImportError:
from .vendor import ipaddress

log = logging.getLogger(__name__) # noqa


def get_client_ip(request):
    """
    Get the client IP address for a request object

    The first entry of the ``X-Forwarded-For`` header is preferred when it is
    present and non-blank; otherwise ``REMOTE_ADDR`` is used.

    :param request: an object exposing a ``META`` mapping of request headers
    :returns: the IP address as found in ``META``, or ``None`` if neither
        key yields a value
    """
    ip_address = request.META.get('REMOTE_ADDR')

    # Get the original IP address (eg. "X-Forwarded-For: client, proxy1, proxy2")
    forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR', '')

    # Strip surrounding whitespace so that padded entries are returned clean
    # and a blank header does not replace REMOTE_ADDR with whitespace
    original_client = forwarded_for.split(',')[0].strip()
    if original_client:
        ip_address = original_client

    return ip_address


def anonymize_ip_address(ip_address):
    """
    Anonymize an IP address by zeroing the last 2 bytes

    Works for both IPv4 and IPv6 addresses; the address family of the input
    is preserved in the output.

    :param ip_address: the IP address to anonymize (text or bytes)
    :returns: the compressed text form of the anonymized address,
        or ``None`` if the input is not a valid IP address
    """
    # 128-bit mask clearing the lowest 16 bits. It is wide enough for IPv6
    # and still only clears the last 2 bytes of a 32-bit IPv4 integer.
    ip_mask = int('0xFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000', 16)

    try:
        ip_obj = ipaddress.ip_address(force_text(ip_address))
    except ValueError:
        return None

    # Rebuild with the same class as the parsed address. The generic
    # ``ipaddress.ip_address`` factory treats any integer below 2**32 as IPv4,
    # which would turn a low-valued IPv6 address such as "::1" into "0.0.0.0".
    anonymized_ip = ip_obj.__class__(int(ip_obj) & ip_mask)
    return anonymized_ip.compressed


def anonymize_user_agent(user_agent):
    """
    Mask user agents that are too rare to be shared safely

    :param user_agent: the raw user agent string
    :returns: ``user_agent`` unchanged when both its browser and OS families
        are recognized, otherwise the fixed string ``'Rare user agent'``
    """
    details = parse(user_agent)

    # The parser reports a family of "Other" for anything it cannot identify
    recognized = (
        details.browser.family != 'Other' and
        details.os.family != 'Other'
    )
    return user_agent if recognized else 'Rare user agent'


def send_to_analytics(data):
    """
    Send data to Google Analytics

    ``data`` is modified in place before it is posted: a ``uid`` is added
    when both the IP and user agent are present, and the ``uip`` and ``ua``
    values are replaced by their anonymized forms.

    Timeouts and non-successful responses are logged as warnings rather than
    raised.

    :param data: dict of Measurement Protocol parameters to post
    """
    if data.get('uip') and data.get('ua'):
        # Derive the ID from the raw values, before they are anonymized below
        data['uid'] = generate_client_id(data['uip'], data['ua'])

    if 'uip' in data:
        # Anonymize IP address if applicable
        data['uip'] = anonymize_ip_address(data['uip'])

    if 'ua' in data:
        # Anonymize user agent if it is rare
        data['ua'] = anonymize_user_agent(data['ua'])

    resp = None
    log.debug('Sending data to analytics: %s', data)
    try:
        resp = requests.post(
            'https://www.google-analytics.com/collect',
            data=data,
            timeout=3,  # seconds
        )
    except requests.Timeout:
        log.warning('Timeout sending to Google Analytics')

    # Compare against None explicitly: a requests Response evaluates as false
    # whenever it is not ok, so a plain truthiness check would make this
    # warning unreachable.
    if resp is not None and not resp.ok:
        log.warning('Unknown error sending to Google Analytics')


def generate_client_id(ip_address, user_agent):
    """
    Create an advertising ID

    The ID is a SHA-256 hex digest over the project secret key, a fixed salt
    and whichever of the IP address and user agent were supplied. The same
    IP and UA therefore always map to the same ID, so such visitors are
    treated as one user for analytics purposes.

    :param ip_address: the user's IP address (may be empty or ``None``)
    :param user_agent: the user's user agent (may be empty or ``None``)
    :returns: the hexadecimal digest string
    """
    digest = hashlib.sha256()
    digest.update(force_bytes(settings.SECRET_KEY))
    digest.update(b'advertising-client-id')  # fixed salt

    # Only values that were actually provided contribute to the hash
    identifying_parts = [part for part in (ip_address, user_agent) if part]
    for part in identifying_parts:
        digest.update(force_bytes(part))

    if not identifying_parts:
        # With neither an IP nor a UA there is no way to distinguish
        # sessions, so mix in a random value to make every ID different
        digest.update(force_bytes(get_random_string()))

    return digest.hexdigest()
Empty file.
Loading