Skip to content

Commit a3e47b5

Browse files
committed
Improvements based on proposed changes for tag cleanup
- Make importing tags for a project an admin function. - Use the GitHub app OAuth credentials. - Make removing unused tags a standalone function so that it can be called from Celery in the future.
1 parent 3dc3e60 commit a3e47b5

File tree

6 files changed

+128
-83
lines changed

6 files changed

+128
-83
lines changed
Lines changed: 6 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,28 @@
1-
"""
2-
Import a project's tags from GitHub.
3-
4-
Requires a ``GITHUB_AUTH_TOKEN`` to be set in the environment.
5-
This should be a "Personal access token" although no permissions are required.
6-
With the token, the rate limit is increased to 5,000 requests/hour
7-
8-
https://github.com/settings/tokens
9-
https://developer.github.com/v3/#rate-limiting
10-
"""
11-
12-
import os
1+
"""Imports tags for projects without tags from GitHub."""
132
import time
143

15-
import requests
16-
from django.core.management.base import BaseCommand, CommandError
4+
from django.core.management.base import BaseCommand
175

18-
from readthedocs.projects.constants import GITHUB_REGEXS
196
from readthedocs.projects.models import Project
7+
from readthedocs.projects.tag_utils import import_tags
208

219

2210
class Command(BaseCommand):

    # NOTE: deliberately no class docstring. ``help = __doc__`` below must
    # resolve to the *module* docstring; a class docstring would shadow it.
    #
    # Imports tags from GitHub for every project that has no tags yet and
    # whose repository URL contains ``github.com``.

    help = __doc__

    def handle(self, *args, **options):
        """
        Fetch and store tags for each untagged GitHub project.

        Progress is written to stdout; projects whose tags could not be
        imported (``import_tags`` returned ``None``) are reported on stderr.
        """
        queryset = Project.objects.filter(tags=None).filter(repo__contains='github.com')
        project_total = queryset.count()

        for i, project in enumerate(queryset.iterator()):
            self.stdout.write(
                '[{}/{}] Fetching tags for {}'.format(i + 1, project_total, project.slug)
            )

            tags = import_tags(project)
            if tags:
                self.stdout.write('Set tags for {}: {}'.format(project.slug, tags))
            elif tags is None:
                # ``None`` signals an error, as opposed to ``[]`` which means
                # the repository simply has no tags to import.
                self.stderr.write('Could not import tags for {}'.format(project.slug))

            # 3600s / 0.75s caps us at 4,800 requests/hour, which stays under
            # the 5k requests/hour limit even if every request is instant.
            time.sleep(0.75)

readthedocs/core/tag_utils.py

Lines changed: 0 additions & 20 deletions
This file was deleted.

readthedocs/projects/admin.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
ResourceUsageNotification,
3030
)
3131
from .tasks import remove_dirs
32+
from .tag_utils import import_tags
3233

3334

3435
class ProjectSendNotificationView(SendNotificationView):
@@ -155,6 +156,7 @@ class ProjectAdmin(admin.ModelAdmin):
155156
'build_default_version',
156157
'reindex_active_versions',
157158
'wipe_all_versions',
159+
'import_tags_from_vcs',
158160
]
159161

160162
def feature_flags(self, obj):
@@ -294,6 +296,24 @@ def wipe_all_versions(self, request, queryset):
294296

295297
wipe_all_versions.short_description = 'Wipe all versions from ES'
296298

299+
def import_tags_from_vcs(self, request, queryset):
    """
    Admin action: import tags for each selected project.

    One message is shown per project: success when tags were imported,
    a warning when the repository has no tags, and an error when
    ``import_tags`` returned ``None`` (the import could not be performed).
    """
    for project in queryset.iterator():
        tags = import_tags(project)
        if tags:
            level = messages.SUCCESS
            text = 'Imported tags for {}: {}'.format(project, tags)
        elif tags is None:
            # ``None`` means the import failed; don't report it as "no tags".
            level = messages.ERROR
            text = 'Error importing tags for {}'.format(project)
        else:
            level = messages.WARNING
            text = 'No tags found for {}'.format(project)
        self.message_user(request, text, level)

import_tags_from_vcs.short_description = 'Import tags from the version control API'
316+
297317
def get_actions(self, request):
298318
actions = super().get_actions(request)
299319
actions['delete_selected'] = (

readthedocs/core/management/commands/clean_tags.py renamed to readthedocs/projects/management/commands/clean_tags.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,17 @@
66
- Differences only in lowercase/uppercase
77
- Slugify all tags
88
- Remove tags with no projects (old & spam mostly)
9+
10+
This command can probably be completely removed after being run.
11+
Future tags should be canonicalized because of the new tag parser in
12+
``readthedocs.projects.tag_utils.rtd_parse_tags``
913
"""
1014

1115
from django.core.management.base import BaseCommand
12-
from django.db.models import Count
13-
from taggit.models import Tag
1416
from taggit.utils import parse_tags, edit_string_for_tags
1517

1618
from readthedocs.projects.models import Project
19+
from readthedocs.projects.tag_utils import remove_unused_tags
1720

1821

1922
class Command(BaseCommand):
@@ -57,23 +60,27 @@ def reprocess_tags(self):
5760
)
5861

5962
def remove_tags_with_no_projects(self):
    """
    Delete all tags that are not attached to any project.

    Reads ``self.dry_run``: in dry-run mode nothing is deleted and a message
    saying so is printed instead, so the command never finishes silently.
    """
    if self.dry_run:
        self.stdout.write('Not removing tags with no projects (dry run)')
        return

    self.stdout.write('Removing tags with no projects...')
    num_deleted, _ = remove_unused_tags()
    self.stdout.write('{} unused tags deleted'.format(num_deleted))
6767

6868
def add_arguments(self, parser):
6969
parser.add_argument(
7070
"--dry-run",
7171
action="store_true",
7272
help="Don't actually perform the actions, just print output",
7373
)
74+
parser.add_argument(
75+
"--remove-unused-only",
76+
action="store_true",
77+
help="Don't canonicalize tags, just delete unused",
78+
)
7479

7580
def handle(self, *args, **options):
7681
self.dry_run = options["dry_run"]
7782

78-
self.reprocess_tags()
83+
if not options["remove_unused_only"]:
84+
self.reprocess_tags()
85+
7986
self.remove_tags_with_no_projects()

readthedocs/projects/tag_utils.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Customizations to Django Taggit."""
2+
from allauth.socialaccount.models import SocialApp
3+
from django.db.models import Count
4+
from django.utils.text import slugify
5+
import requests
6+
from taggit.models import Tag
7+
from taggit.utils import _parse_tags
8+
9+
from .constants import GITHUB_REGEXS
10+
11+
12+
def rtd_parse_tags(tag_string):
    """
    Parses a string into its tags.

    - Lowercases all tags
    - Converts underscores to hyphens
    - Slugifies tags
    - Drops tags that slugify to an empty string
    - Drops duplicates produced by slugifying

    :see: https://django-taggit.readthedocs.io/page/custom_tagging.html
    :param tag_string: a delimited string of tags
    :return: a sorted list of unique, non-empty tag strings
    """
    if tag_string:
        tag_string = tag_string.lower().replace('_', '-')

    # Slugifying strips characters, so distinct inputs can collapse into the
    # same slug (``c++`` and ``c``) or into nothing at all (``+++``).
    # A set removes the duplicates; the empty slug is discarded explicitly.
    slugs = {slugify(tag) for tag in _parse_tags(tag_string)}
    slugs.discard('')
    return sorted(slugs)
28+
29+
30+
def remove_unused_tags():
    """
    Delete every tag that has no tagged items (projects) attached.

    :returns: the result of the queryset ``delete()`` call
    """
    tags_with_usage = Tag.objects.all().annotate(num=Count('taggit_taggeditem_items'))
    unused_tags = tags_with_usage.filter(num=0)
    return unused_tags.delete()
35+
36+
37+
def import_tags(project):
    """
    Import tags using the version control API.

    Currently, this is only implemented for GitHub.
    Requests are authenticated with the client ID and client secret of the
    GitHub ``SocialApp``, otherwise the rate limit is 60/hr.
    https://developer.github.com/v3/#rate-limiting

    :param project: the project to import tags for; its ``repo`` URL is
        matched against ``GITHUB_REGEXS`` to find the GitHub user and repo
    :returns: A list of the tags set (empty when the repository has none)
        or ``None`` on an error
    """
    user = repo = ''
    for regex in GITHUB_REGEXS:
        match = regex.search(project.repo)
        if match:
            user, repo = match.groups()
            break

    if not user:
        # Not a GitHub repository URL we know how to parse
        return None

    provider = SocialApp.objects.filter(provider='github').first()
    if not provider:
        # No GitHub app configured, so there are no credentials to use
        return None

    # https://developer.github.com/v3/repos/#list-all-topics-for-a-repository
    url = 'https://api.github.com/repos/{user}/{repo}/topics'.format(
        user=user,
        repo=repo,
    )
    headers = {
        # Getting topics is a preview API and may change
        # It requires this custom Accept header
        'Accept': 'application/vnd.github.mercy-preview+json',
    }

    try:
        resp = requests.get(
            url,
            headers=headers,
            # Send the app credentials with HTTP basic auth rather than as
            # query parameters: GitHub deprecated query parameter
            # authentication, and it leaks the secret into URLs and logs.
            auth=(provider.client_id, provider.secret),
            # Never hang forever on a stalled connection
            timeout=10,
        )
    except requests.RequestException:
        return None

    if not resp.ok:
        return None

    try:
        tags = resp.json()['names']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or not the expected ``{"names": [...]}`` shape
        return None

    if tags:
        project.tags.set(*tags)
        return tags
    return []

readthedocs/settings/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ def USE_PROMOS(self): # noqa
465465

466466
# Taggit
467467
# https://django-taggit.readthedocs.io
468-
TAGGIT_TAGS_FROM_STRING = 'readthedocs.core.tag_utils.rtd_parse_tags'
468+
TAGGIT_TAGS_FROM_STRING = 'readthedocs.projects.tag_utils.rtd_parse_tags'
469469

470470
# Stripe
471471
STRIPE_SECRET = None

0 commit comments

Comments
 (0)