Skip to content

Commit 28a8303

Browse files
authored
Merge pull request #5983 from readthedocs/davidfischer/tags-cleanup
Cleanup project tags
2 parents c051b16 + a3e47b5 commit 28a8303

File tree

5 files changed

+222
-0
lines changed

5 files changed

+222
-0
lines changed
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""Imports tags for projects without tags from GitHub."""
2+
import time
3+
4+
from django.core.management.base import BaseCommand
5+
6+
from readthedocs.projects.models import Project
7+
from readthedocs.projects.tag_utils import import_tags
8+
9+
10+
class Command(BaseCommand):

    # Deliberately no class docstring: inside a class body ``__doc__`` would
    # then resolve to that docstring, and ``help`` must remain the module
    # docstring.
    help = __doc__

    # Pause between two consecutive tag imports, in seconds.
    # 3600 / 0.75 = 4800 calls per hour at most, which stays below the
    # 5000 requests/hour limit even if every request returned instantly.
    # (A 0.5s pause only bounds the rate at 7200 calls per hour.)
    request_interval = 0.75

    def handle(self, *args, **options):
        """
        Import tags for every untagged project whose repo URL mentions GitHub.

        Selects projects with no tags and a ``repo`` containing
        ``github.com``, calls ``import_tags`` on each one, writes progress to
        stdout and sleeps ``request_interval`` seconds after every project.
        """
        queryset = Project.objects.filter(tags=None).filter(repo__contains='github.com')
        project_total = queryset.count()

        # ``iterator()`` avoids caching the whole queryset in memory
        for position, project in enumerate(queryset.iterator(), start=1):
            self.stdout.write(
                '[{}/{}] Fetching tags for {}'.format(position, project_total, project.slug)
            )

            tags = import_tags(project)
            if tags:
                self.stdout.write('Set tags for {}: {}'.format(project.slug, tags))

            # Throttle so we stay under 5k requests/hour
            time.sleep(self.request_interval)

readthedocs/projects/admin.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
ResourceUsageNotification,
3030
)
3131
from .tasks import remove_dirs
32+
from .tag_utils import import_tags
3233

3334

3435
class ProjectSendNotificationView(SendNotificationView):
@@ -155,6 +156,7 @@ class ProjectAdmin(admin.ModelAdmin):
155156
'build_default_version',
156157
'reindex_active_versions',
157158
'wipe_all_versions',
159+
'import_tags_from_vcs',
158160
]
159161

160162
def feature_flags(self, obj):
@@ -294,6 +296,24 @@ def wipe_all_versions(self, request, queryset):
294296

295297
wipe_all_versions.short_description = 'Wipe all versions from ES'
296298

299+
def import_tags_from_vcs(self, request, queryset):
    """
    Admin action that runs ``import_tags`` on each selected project.

    One admin message is emitted per project: a success message listing the
    tags when ``import_tags`` returned something truthy, a warning otherwise.
    """
    for project in queryset.iterator():
        imported = import_tags(project)

        if not imported:
            # Covers both "no tags" (empty list) and "error" (``None``)
            self.message_user(
                request,
                'No tags found for {}'.format(project),
                messages.WARNING
            )
            continue

        self.message_user(
            request,
            'Imported tags for {}: {}'.format(project, imported),
            messages.SUCCESS
        )

import_tags_from_vcs.short_description = 'Import tags from the version control API'
316+
297317
def get_actions(self, request):
298318
actions = super().get_actions(request)
299319
actions['delete_selected'] = (
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""
2+
Clean up project tags.
3+
4+
This specifically aims to clean up:
5+
6+
- Differences only in lowercase/uppercase
7+
- Slugify all tags
8+
- Remove tags with no projects (old & spam mostly)
9+
10+
This command can probably be completely removed after being run.
11+
Future tags should be canonicalized because of the new tag parser in
12+
``readthedocs.projects.tag_utils.rtd_parse_tags``
13+
"""
14+
15+
from django.core.management.base import BaseCommand
16+
from taggit.utils import parse_tags, edit_string_for_tags
17+
18+
from readthedocs.projects.models import Project
19+
from readthedocs.projects.tag_utils import remove_unused_tags
20+
21+
22+
class Command(BaseCommand):

    # Deliberately no class docstring: ``__doc__`` below must keep resolving
    # to the module docstring, which is used as the command's help text.
    help = __doc__
    # Overwritten from the ``--dry-run`` option at the start of ``handle``
    dry_run = False

    def add_arguments(self, parser):
        """Register the ``--dry-run`` and ``--remove-unused-only`` flags."""
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Don't actually perform the actions, just print output",
        )
        parser.add_argument(
            "--remove-unused-only",
            action="store_true",
            help="Don't canonicalize tags, just delete unused",
        )

    def handle(self, *args, **options):
        """Reprocess tags unless told not to, then remove unused tags."""
        self.dry_run = options["dry_run"]

        canonicalize = not options["remove_unused_only"]
        if canonicalize:
            self.reprocess_tags()

        self.remove_tags_with_no_projects()

    def reprocess_tags(self):
        """
        Re-parse the tags of every project and store them when they changed.

        The current tags are rendered to a tag string and parsed again; the
        project is only touched when the parsed result differs from the
        sorted list of current tag names. In dry-run mode the change is
        reported but not applied.
        """
        self.stdout.write('Reprocessing tags (lowercasing, slugifying, etc.)...')
        total = Project.objects.count()

        # Stream the projects rather than caching the whole queryset:
        # slow for a large table, but keeps memory usage flat
        for position, project in enumerate(Project.objects.iterator(), start=1):
            current = list(project.tags.all())
            if not current:
                continue

            current_names = sorted(tag.name for tag in current)
            tag_string = edit_string_for_tags(current)
            # Note: "parse_tags" handles sorting
            canonical = parse_tags(tag_string)

            if canonical == current_names:
                continue

            if self.dry_run:
                self.stdout.write(
                    '[{}/{}] Not setting tags on "{}" (dry run)'.format(
                        position,
                        total,
                        project.slug,
                    )
                )
                continue

            self.stdout.write(
                '[{}/{}] Setting tags on "{}"'.format(
                    position,
                    total,
                    project.slug,
                )
            )
            project.tags.set(*canonical)

    def remove_tags_with_no_projects(self):
        """Delete unused tags and report how many; a no-op in dry-run mode."""
        if self.dry_run:
            return

        self.stdout.write('Removing tags with no projects...')
        deleted_count, _ = remove_unused_tags()
        self.stdout.write('{} unused tags deleted'.format(deleted_count))

readthedocs/projects/tag_utils.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Customizations to Django Taggit."""
2+
from allauth.socialaccount.models import SocialApp
3+
from django.db.models import Count
4+
from django.utils.text import slugify
5+
import requests
6+
from taggit.models import Tag
7+
from taggit.utils import _parse_tags
8+
9+
from .constants import GITHUB_REGEXS
10+
11+
12+
def rtd_parse_tags(tag_string):
    """
    Parses a string into its tags.

    - Lowercases all tags
    - Converts underscores to hyphens
    - Slugifies tags
    - Drops tags whose slug is empty and collapses duplicate slugs

    :see: https://django-taggit.readthedocs.io/page/custom_tagging.html
    :param tag_string: a delimited string of tags
    :return: a sorted list of unique, non-empty tag strings
    """
    if tag_string:
        tag_string = tag_string.lower().replace('_', '-')

    # A set collapses inputs that end up as the same slug
    # (e.g. "foo bar" and "foo-bar").
    slugs = {slugify(tag) for tag in _parse_tags(tag_string)}

    # ``slugify`` yields '' for a tag made only of characters it strips
    # (e.g. "!!!"); an empty tag name is never wanted.
    slugs.discard('')

    return sorted(slugs)
28+
29+
30+
def remove_unused_tags():
    """
    Delete every tag that has no corresponding items (projects).

    :returns: the result of the queryset ``delete()`` call
    """
    with_usage = Tag.objects.all().annotate(num=Count('taggit_taggeditem_items'))
    orphaned = with_usage.filter(num=0)
    return orphaned.delete()
35+
36+
37+
def import_tags(project, timeout=10):
    """
    Import tags using the version control API.

    Currently, this is only implemented for github.
    Uses the client ID and client secret for github otherwise the rate limit is 60/hr.
    https://developer.github.com/v3/#rate-limiting

    :param project: the project whose ``repo`` URL is inspected and whose
        tags are replaced by the repository topics
    :param timeout: seconds to wait for the GitHub API before giving up
    :returns: A list of the tags set or ``None`` on an error
        (``[]`` when the repository has no topics)
    """
    user = repo = ''
    for regex in GITHUB_REGEXS:
        match = regex.search(project.repo)
        if match:
            user, repo = match.groups()
            break

    if not user:
        return None

    provider = SocialApp.objects.filter(provider='github').first()
    if not provider:
        return None

    # https://developer.github.com/v3/repos/#list-all-topics-for-a-repository
    url = 'https://api.github.com/repos/{user}/{repo}/topics'.format(
        user=user,
        repo=repo,
    )
    headers = {
        # Getting topics is a preview API and may change
        # It requires this custom Accept header
        'Accept': 'application/vnd.github.mercy-preview+json',
    }

    try:
        resp = requests.get(
            url,
            headers=headers,
            # GitHub deprecated passing ``client_id``/``client_secret`` as
            # query parameters; send them with HTTP Basic auth instead.
            auth=(provider.client_id, provider.secret),
            # Without a timeout a stalled connection would block forever
            timeout=timeout,
        )
    except requests.RequestException:
        # Connection failure, timeout, etc. count as "an error"
        return None

    if not resp.ok:
        return None

    try:
        tags = resp.json()['names']
    except (ValueError, KeyError, TypeError):
        # Body is not JSON or does not have the expected shape
        return None

    if tags:
        project.tags.set(*tags)
        return tags
    return []

readthedocs/settings/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,10 @@ def USE_PROMOS(self): # noqa
464464

465465
INTERNAL_IPS = ('127.0.0.1',)
466466

467+
# Taggit
468+
# https://django-taggit.readthedocs.io
469+
TAGGIT_TAGS_FROM_STRING = 'readthedocs.projects.tag_utils.rtd_parse_tags'
470+
467471
# Stripe
468472
STRIPE_SECRET = None
469473
STRIPE_PUBLISHABLE = None

0 commit comments

Comments
 (0)