Skip to content

Cleanup project tags #5983

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions readthedocs/core/management/commands/import_github_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Imports tags for projects without tags from GitHub."""
import time

from django.core.management.base import BaseCommand

from readthedocs.projects.models import Project
from readthedocs.projects.tag_utils import import_tags


class Command(BaseCommand):

help = __doc__

def handle(self, *args, **options):
queryset = Project.objects.filter(tags=None).filter(repo__contains='github.com')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this should live in a manager for github/gitlab/bitbucket and other providers we support. I've seen these types of queries scattered around a lot, and would be good to standardize on an approach. Not necessary in this PR though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tend to agree and I wouldn't mind expanding them to the other providers.

project_total = queryset.count()

for i, project in enumerate(queryset.iterator()):
self.stdout.write(
'[{}/{}] Fetching tags for {}'.format(i + 1, project_total, project.slug)
)

tags = import_tags(project)
if tags:
self.stdout.write('Set tags for {}: {}'.format(project.slug, tags))

# Sleeping half a second alone caps us at 7,200 requests/hour, so staying
# under 5k requests/hour relies on each API call taking roughly 0.2s or more
time.sleep(0.5)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clever. 👍

20 changes: 20 additions & 0 deletions readthedocs/projects/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
ResourceUsageNotification,
)
from .tasks import remove_dirs
from .tag_utils import import_tags


class ProjectSendNotificationView(SendNotificationView):
Expand Down Expand Up @@ -155,6 +156,7 @@ class ProjectAdmin(admin.ModelAdmin):
'build_default_version',
'reindex_active_versions',
'wipe_all_versions',
'import_tags_from_vcs',
]

def feature_flags(self, obj):
Expand Down Expand Up @@ -294,6 +296,24 @@ def wipe_all_versions(self, request, queryset):

wipe_all_versions.short_description = 'Wipe all versions from ES'

def import_tags_from_vcs(self, request, queryset):
    """
    Admin action that imports tags for every selected project.

    Calls ``import_tags`` on each project in ``queryset`` and reports the
    outcome for that project back to the admin user as a message.
    """
    for project in queryset.iterator():
        imported = import_tags(project)

        if not imported:
            # Any falsy result (no tags came back) is reported as a warning
            self.message_user(
                request,
                'No tags found for {}'.format(project),
                messages.WARNING,
            )
            continue

        self.message_user(
            request,
            'Imported tags for {}: {}'.format(project, imported),
            messages.SUCCESS,
        )

import_tags_from_vcs.short_description = 'Import tags from the version control API'

def get_actions(self, request):
actions = super().get_actions(request)
actions['delete_selected'] = (
Expand Down
86 changes: 86 additions & 0 deletions readthedocs/projects/management/commands/clean_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Cleanup project tags.

This specifically aims to cleanup:

- Differences only in lowercase/uppercase
- Slugify all tags
- Remove tags with no projects (old & spam mostly)

This command can probably be completely removed after being run.
Future tags should be canonicalized because of the new tag parser in
``readthedocs.projects.tag_utils.rtd_parse_tags``
"""

from django.core.management.base import BaseCommand
from taggit.utils import parse_tags, edit_string_for_tags

from readthedocs.projects.models import Project
from readthedocs.projects.tag_utils import remove_unused_tags


class Command(BaseCommand):

    # NOTE: deliberately no class docstring here. ``__doc__`` below must keep
    # resolving to the module docstring, which doubles as the help text.
    help = __doc__

    # Overwritten from the ``--dry-run`` option in ``handle``
    dry_run = False

    def add_arguments(self, parser):
        """Register the ``--dry-run`` and ``--remove-unused-only`` flags."""
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Don't actually perform the actions, just print output",
        )
        parser.add_argument(
            "--remove-unused-only",
            action="store_true",
            help="Don't canonicalize tags, just delete unused",
        )

    def handle(self, *args, **options):
        """Entry point: optionally canonicalize tags, then prune unused ones."""
        self.dry_run = options["dry_run"]
        skip_reprocessing = options["remove_unused_only"]

        if not skip_reprocessing:
            self.reprocess_tags()

        self.remove_tags_with_no_projects()

    def _write_progress(self, template, position, total, slug):
        """Write one ``[position/total]`` progress line for a project."""
        self.stdout.write(template.format(position, total, slug))

    def reprocess_tags(self):
        """
        Re-parse the tags of every project and store the parsed form.

        A project is only touched when re-parsing its current tags gives a
        different list than the sorted tag names it already has. In dry-run
        mode the change is reported but not saved.
        """
        self.stdout.write('Reprocessing tags (lowercasing, slugifying, etc.)...')
        project_total = Project.objects.count()

        # ``iterator()`` avoids holding the whole queryset in memory; this may
        # take a long time but should be memory efficient
        for position, project in enumerate(Project.objects.iterator(), start=1):
            current_tag_objs = list(project.tags.all())
            if not current_tag_objs:
                continue

            current_names = sorted(tag.name for tag in current_tag_objs)
            tag_string = edit_string_for_tags(current_tag_objs)
            # Note: "parse_tags" handles sorting
            parsed_names = parse_tags(tag_string)

            if parsed_names == current_names:
                # Already in the parsed form, nothing to update
                continue

            if self.dry_run:
                self._write_progress(
                    '[{}/{}] Not setting tags on "{}" (dry run)',
                    position,
                    project_total,
                    project.slug,
                )
                continue

            self._write_progress(
                '[{}/{}] Setting tags on "{}"',
                position,
                project_total,
                project.slug,
            )
            project.tags.set(*parsed_names)

    def remove_tags_with_no_projects(self):
        """Delete tags that no project uses; does nothing in dry-run mode."""
        if self.dry_run:
            return

        self.stdout.write('Removing tags with no projects...')
        deleted_count, _details = remove_unused_tags()
        self.stdout.write('{} unused tags deleted'.format(deleted_count))
84 changes: 84 additions & 0 deletions readthedocs/projects/tag_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Customizations to Django Taggit."""
from allauth.socialaccount.models import SocialApp
from django.db.models import Count
from django.utils.text import slugify
import requests
from taggit.models import Tag
from taggit.utils import _parse_tags

from .constants import GITHUB_REGEXS


def rtd_parse_tags(tag_string):
    """
    Parses a string into its tags.

    - Lowercases all tags
    - Converts underscores to hyphens
    - Slugifies tags
    - Drops tags that slugify to an empty string
    - Collapses tags that slugify to the same value

    :see: https://django-taggit.readthedocs.io/page/custom_tagging.html
    :param tag_string: a delimited string of tags
    :return: a sorted list of unique, non-empty tag strings
    """
    if tag_string:
        tag_string = tag_string.lower().replace('_', '-')

    # Slugifying happens after the raw tags were split, so two distinct raw
    # tags can end up identical; a set keeps only one copy of each.
    slugs = {slugify(tag) for tag in _parse_tags(tag_string)}

    # A tag made only of characters that slugify strips becomes '' and must
    # not be returned as a tag.
    slugs.discard('')

    return sorted(slugs)


def remove_unused_tags():
    """
    Delete every tag that has no corresponding items (projects).

    :returns: the result of the queryset ``delete()`` call
    """
    tags_with_usage = Tag.objects.all().annotate(
        num=Count('taggit_taggeditem_items'),
    )
    unused_tags = tags_with_usage.filter(num=0)
    return unused_tags.delete()


def import_tags(project):
    """
    Import tags using the version control API.

    Currently, this is only implemented for github.
    Uses the client ID and client secret for github otherwise the rate limit is 60/hr.
    https://developer.github.com/v3/#rate-limiting

    The credentials are sent with HTTP basic auth rather than as query
    parameters, so the secret does not end up in the request URL.

    :param project: the project whose ``repo`` URL is matched against
        ``GITHUB_REGEXS`` and whose tags are replaced on success
    :returns: A list of the tags set (``[]`` when the repository has no
        topics) or ``None`` on an error. Errors include a repo URL that is not
        a GitHub URL, no configured GitHub social app, a network failure or
        timeout, a non-OK response, or an unexpected response body.
    """
    user = repo = ''
    for regex in GITHUB_REGEXS:
        match = regex.search(project.repo)
        if match:
            user, repo = match.groups()
            break

    if not user:
        return None

    provider = SocialApp.objects.filter(provider='github').first()
    if not provider:
        return None

    # https://developer.github.com/v3/repos/#list-all-topics-for-a-repository
    url = 'https://api.github.com/repos/{user}/{repo}/topics'.format(
        user=user,
        repo=repo,
    )
    headers = {
        # Getting topics is a preview API and may change
        # It requires this custom Accept header
        'Accept': 'application/vnd.github.mercy-preview+json',
    }

    try:
        resp = requests.get(
            url,
            headers=headers,
            auth=(provider.client_id, provider.secret),
            # Without a timeout a stalled connection would block the caller
            # (management command or admin request) indefinitely
            timeout=10,
        )
    except requests.RequestException:
        return None

    if not resp.ok:
        return None

    try:
        tags = resp.json()['names']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON or did not have the expected shape
        return None

    if tags:
        project.tags.set(*tags)
        return tags
    return []
4 changes: 4 additions & 0 deletions readthedocs/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,10 @@ def USE_PROMOS(self): # noqa

INTERNAL_IPS = ('127.0.0.1',)

# Taggit
# https://django-taggit.readthedocs.io
TAGGIT_TAGS_FROM_STRING = 'readthedocs.projects.tag_utils.rtd_parse_tags'

# Stripe
STRIPE_SECRET = None
STRIPE_PUBLISHABLE = None
Expand Down