From 5661e745323747d0eda66e2b61ea312537beaa9c Mon Sep 17 00:00:00 2001 From: David Fischer Date: Tue, 23 Jul 2019 16:09:20 -0700 Subject: [PATCH 1/4] Cleanup project tags - Adds a management command to clean up tags - Cleanup involves lowercasing and slugifying (canonicalizing) - Future tags will come in canonicalized --- .../core/management/commands/clean_tags.py | 75 +++++++++++++++++++ readthedocs/core/tag_utils.py | 20 +++++ readthedocs/settings/base.py | 4 + 3 files changed, 99 insertions(+) create mode 100644 readthedocs/core/management/commands/clean_tags.py create mode 100644 readthedocs/core/tag_utils.py diff --git a/readthedocs/core/management/commands/clean_tags.py b/readthedocs/core/management/commands/clean_tags.py new file mode 100644 index 00000000000..fabcab743e6 --- /dev/null +++ b/readthedocs/core/management/commands/clean_tags.py @@ -0,0 +1,75 @@ +""" +Cleanup project tags + +This specifically aims to cleanup: + +- Differences only in lowercase/uppercase +- Slugify all tags +- Remove tags with no projects (old & spam mostly) +""" + +from django.core.management.base import BaseCommand +from django.db.models import Count +from taggit.models import Tag +from taggit.utils import parse_tags, edit_string_for_tags + +from readthedocs.projects.models import Project + + +class Command(BaseCommand): + + help = __doc__ + dry_run = False + + def reprocess_tags(self): + self.stdout.write('Reprocessing tags (lowercasing, slugifying, etc.)...') + project_total = Project.objects.count() + + # Use an iterator so the queryset isn't stored in memory + # This may take a long time but should be memory efficient + for i, project in enumerate(Project.objects.iterator()): + old_tags_objs = list(project.tags.all()) + + if old_tags_objs: + old_tags = sorted([t.name for t in old_tags_objs]) + old_tag_string = edit_string_for_tags(old_tags_objs) + new_tags = parse_tags(old_tag_string) + + # Update the tags on the project if they are different + # Note: "parse_tags" handles 
sorting + if new_tags != old_tags: + if not self.dry_run: + self.stdout.write( + '[{}/{}] Setting tags on "{}"'.format(i, project_total, project.slug) + ) + project.tags.set(*new_tags) + else: + self.stdout.write( + '[{}/{}] Not setting tags on "{}" (dry run)'.format( + i, + project_total, + project.slug, + ), + ) + + def remove_tags_with_no_projects(self): + self.stdout.write('Removing tags with no projects...') + for tag in Tag.objects.all().annotate(num=Count('taggit_taggeditem_items')).filter(num=0): + if not self.dry_run: + self.stdout.write('Removing tag {}'.format(tag.name)) + tag.delete() + else: + self.stdout.write('Not removing tag "{}" (dry run)'.format(tag.name)) + + def add_arguments(self, parser): + parser.add_argument( + "--dry-run", + action="store_true", + help="Don't actually perform the actions, just print output", + ) + + def handle(self, *args, **options): + self.dry_run = options["dry_run"] + + self.reprocess_tags() + self.remove_tags_with_no_projects() diff --git a/readthedocs/core/tag_utils.py b/readthedocs/core/tag_utils.py new file mode 100644 index 00000000000..1d801fed068 --- /dev/null +++ b/readthedocs/core/tag_utils.py @@ -0,0 +1,20 @@ +"""Customizations to Django Taggit""" +from django.utils.text import slugify +from taggit.utils import _parse_tags + + +def rtd_parse_tags(tag_string): + """ + Parses a string into its tags + + - Lowercases all tags + - Slugifies tags + + :see: https://django-taggit.readthedocs.io/page/custom_tagging.html + :param tag_string: a delimited string of tags + :return: a sorted list of tag strings + """ + if tag_string: + tag_string = tag_string.lower() + + return [slugify(tag) for tag in _parse_tags(tag_string)] diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index 3ae10d384f4..d5d677efc80 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -463,6 +463,10 @@ def USE_PROMOS(self): # noqa INTERNAL_IPS = ('127.0.0.1',) + # Taggit + # 
https://django-taggit.readthedocs.io + TAGGIT_TAGS_FROM_STRING = 'readthedocs.core.tag_utils.rtd_parse_tags' + # Stripe STRIPE_SECRET = None STRIPE_PUBLISHABLE = None From 6b936a72bfe18baead22768b7c861dafb9815885 Mon Sep 17 00:00:00 2001 From: David Fischer Date: Tue, 23 Jul 2019 17:31:49 -0700 Subject: [PATCH 2/4] Fix linting issues --- readthedocs/core/management/commands/clean_tags.py | 2 +- readthedocs/core/tag_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/readthedocs/core/management/commands/clean_tags.py b/readthedocs/core/management/commands/clean_tags.py index fabcab743e6..b034d5ebf06 100644 --- a/readthedocs/core/management/commands/clean_tags.py +++ b/readthedocs/core/management/commands/clean_tags.py @@ -1,5 +1,5 @@ """ -Cleanup project tags +Cleanup project tags. This specifically aims to cleanup: diff --git a/readthedocs/core/tag_utils.py b/readthedocs/core/tag_utils.py index 1d801fed068..00fe61cf2ef 100644 --- a/readthedocs/core/tag_utils.py +++ b/readthedocs/core/tag_utils.py @@ -1,11 +1,11 @@ -"""Customizations to Django Taggit""" +"""Customizations to Django Taggit.""" from django.utils.text import slugify from taggit.utils import _parse_tags def rtd_parse_tags(tag_string): """ - Parses a string into its tags + Parses a string into its tags. 
- Lowercases all tags - Slugifies tags From 3dc3e60679bec59e07b6d1bd6b67d5d80d55e747 Mon Sep 17 00:00:00 2001 From: David Fischer Date: Tue, 23 Jul 2019 22:33:19 -0700 Subject: [PATCH 3/4] Add a management command for importing tags from github --- .../core/management/commands/clean_tags.py | 10 ++- .../management/commands/import_github_tags.py | 74 +++++++++++++++++++ 2 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 readthedocs/core/management/commands/import_github_tags.py diff --git a/readthedocs/core/management/commands/clean_tags.py b/readthedocs/core/management/commands/clean_tags.py index b034d5ebf06..de60bd5adf2 100644 --- a/readthedocs/core/management/commands/clean_tags.py +++ b/readthedocs/core/management/commands/clean_tags.py @@ -40,16 +40,20 @@ def reprocess_tags(self): if new_tags != old_tags: if not self.dry_run: self.stdout.write( - '[{}/{}] Setting tags on "{}"'.format(i, project_total, project.slug) + '[{}/{}] Setting tags on "{}"'.format( + i + 1, + project_total, + project.slug, + ) ) project.tags.set(*new_tags) else: self.stdout.write( '[{}/{}] Not setting tags on "{}" (dry run)'.format( - i, + i + 1, project_total, project.slug, - ), + ) ) def remove_tags_with_no_projects(self): diff --git a/readthedocs/core/management/commands/import_github_tags.py b/readthedocs/core/management/commands/import_github_tags.py new file mode 100644 index 00000000000..57a17d9e57f --- /dev/null +++ b/readthedocs/core/management/commands/import_github_tags.py @@ -0,0 +1,74 @@ +""" +Import a project's tags from GitHub. + +Requires a ``GITHUB_AUTH_TOKEN`` to be set in the environment. +This should be a "Personal access token" although no permissions are required. 
+With the token, the rate limit is increased to 5,000 requests/hour + +https://github.com/settings/tokens +https://developer.github.com/v3/#rate-limiting +""" + +import os +import time + +import requests +from django.core.management.base import BaseCommand, CommandError + +from readthedocs.projects.constants import GITHUB_REGEXS +from readthedocs.projects.models import Project + + +class Command(BaseCommand): + + help = __doc__ + + def handle(self, *args, **options): + token = os.environ.get('GITHUB_AUTH_TOKEN') + if not token: + raise CommandError('Invalid GitHub token, exiting...') + + queryset = Project.objects.filter(tags=None).filter(repo__contains='github.com') + project_total = queryset.count() + + for i, project in enumerate(queryset.iterator()): + # Get the user and repo name from the URL as required by GitHub's API + user = repo = '' + for regex in GITHUB_REGEXS: + match = regex.search(project.repo) + if match: + user, repo = match.groups() + break + + if not user: + self.stderr.write(f'No GitHub repo for {project.repo}') + continue + + # https://developer.github.com/v3/repos/#list-all-topics-for-a-repository + url = 'https://api.github.com/repos/{user}/{repo}/topics'.format( + user=user, + repo=repo, + ) + headers = { + 'Authorization': 'token {token}'.format(token=token), + + # Getting topics is a preview API and may change + # It requires this custom Accept header + 'Accept': 'application/vnd.github.mercy-preview+json', + } + + self.stdout.write( + '[{}/{}] Fetching tags for {}'.format(i + 1, project_total, project.slug) + ) + + resp = requests.get(url, headers=headers) + if resp.ok: + tags = resp.json()['names'] + if tags: + self.stdout.write('Setting tags for {}: {}'.format(project.slug, tags)) + project.tags.set(*tags) + else: + self.stderr.write('GitHub API error ({}): {}'.format(project.slug, resp.content)) + + # Sleeping half a second should keep us under 5k requests/hour + time.sleep(0.5) From a3e47b546870553eac80bf1036178d7bdd0c2b01 Mon Sep 17 
00:00:00 2001 From: David Fischer Date: Thu, 25 Jul 2019 13:18:59 -0700 Subject: [PATCH 4/4] Improvements based on proposed changes for tag cleanup - Make importing tags for a project an admin function - Use the GitHub app OAuth credentials - Make removing unused tags a reusable function so it can be reused in the future by Celery --- .../management/commands/import_github_tags.py | 58 ++----------- readthedocs/core/tag_utils.py | 20 ----- readthedocs/projects/admin.py | 20 +++++ .../management/commands/clean_tags.py | 27 +++--- readthedocs/projects/tag_utils.py | 84 +++++++++++++++++++ readthedocs/settings/base.py | 2 +- 6 files changed, 128 insertions(+), 83 deletions(-) delete mode 100644 readthedocs/core/tag_utils.py rename readthedocs/{core => projects}/management/commands/clean_tags.py (76%) create mode 100644 readthedocs/projects/tag_utils.py diff --git a/readthedocs/core/management/commands/import_github_tags.py b/readthedocs/core/management/commands/import_github_tags.py index 57a17d9e57f..2c6a15a3ebc 100644 --- a/readthedocs/core/management/commands/import_github_tags.py +++ b/readthedocs/core/management/commands/import_github_tags.py @@ -1,22 +1,10 @@ -""" -Import a project's tags from GitHub. - -Requires a ``GITHUB_AUTH_TOKEN`` to be set in the environment. -This should be a "Personal access token" although no permissions are required. 
-With the token, the rate limit is increased to 5,000 requests/hour - -https://github.com/settings/tokens -https://developer.github.com/v3/#rate-limiting -""" - -import os +"""Imports tags for projects without tags from GitHub.""" import time -import requests -from django.core.management.base import BaseCommand, CommandError +from django.core.management.base import BaseCommand -from readthedocs.projects.constants import GITHUB_REGEXS from readthedocs.projects.models import Project +from readthedocs.projects.tag_utils import import_tags class Command(BaseCommand): @@ -24,51 +12,17 @@ class Command(BaseCommand): help = __doc__ def handle(self, *args, **options): - token = os.environ.get('GITHUB_AUTH_TOKEN') - if not token: - raise CommandError('Invalid GitHub token, exiting...') - queryset = Project.objects.filter(tags=None).filter(repo__contains='github.com') project_total = queryset.count() for i, project in enumerate(queryset.iterator()): - # Get the user and repo name from the URL as required by GitHub's API - user = repo = '' - for regex in GITHUB_REGEXS: - match = regex.search(project.repo) - if match: - user, repo = match.groups() - break - - if not user: - self.stderr.write(f'No GitHub repo for {project.repo}') - continue - - # https://developer.github.com/v3/repos/#list-all-topics-for-a-repository - url = 'https://api.github.com/repos/{user}/{repo}/topics'.format( - user=user, - repo=repo, - ) - headers = { - 'Authorization': 'token {token}'.format(token=token), - - # Getting topics is a preview API and may change - # It requires this custom Accept header - 'Accept': 'application/vnd.github.mercy-preview+json', - } - self.stdout.write( '[{}/{}] Fetching tags for {}'.format(i + 1, project_total, project.slug) ) - resp = requests.get(url, headers=headers) - if resp.ok: - tags = resp.json()['names'] - if tags: - self.stdout.write('Setting tags for {}: {}'.format(project.slug, tags)) - project.tags.set(*tags) - else: - self.stderr.write('GitHub API error ({}): 
{}'.format(project.slug, resp.content)) + tags = import_tags(project) + if tags: + self.stdout.write('Set tags for {}: {}'.format(project.slug, tags)) # Sleeping half a second should keep us under 5k requests/hour time.sleep(0.5) diff --git a/readthedocs/core/tag_utils.py b/readthedocs/core/tag_utils.py deleted file mode 100644 index 00fe61cf2ef..00000000000 --- a/readthedocs/core/tag_utils.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Customizations to Django Taggit.""" -from django.utils.text import slugify -from taggit.utils import _parse_tags - - -def rtd_parse_tags(tag_string): - """ - Parses a string into its tags. - - - Lowercases all tags - - Slugifies tags - - :see: https://django-taggit.readthedocs.io/page/custom_tagging.html - :param tag_string: a delimited string of tags - :return: a sorted list of tag strings - """ - if tag_string: - tag_string = tag_string.lower() - - return [slugify(tag) for tag in _parse_tags(tag_string)] diff --git a/readthedocs/projects/admin.py b/readthedocs/projects/admin.py index fe39ce49f06..2f9d6f8acf0 100644 --- a/readthedocs/projects/admin.py +++ b/readthedocs/projects/admin.py @@ -29,6 +29,7 @@ ResourceUsageNotification, ) from .tasks import remove_dirs +from .tag_utils import import_tags class ProjectSendNotificationView(SendNotificationView): @@ -155,6 +156,7 @@ class ProjectAdmin(admin.ModelAdmin): 'build_default_version', 'reindex_active_versions', 'wipe_all_versions', + 'import_tags_from_vcs', ] def feature_flags(self, obj): @@ -294,6 +296,24 @@ def wipe_all_versions(self, request, queryset): wipe_all_versions.short_description = 'Wipe all versions from ES' + def import_tags_from_vcs(self, request, queryset): + for project in queryset.iterator(): + tags = import_tags(project) + if tags: + self.message_user( + request, + 'Imported tags for {}: {}'.format(project, tags), + messages.SUCCESS + ) + else: + self.message_user( + request, + 'No tags found for {}'.format(project), + messages.WARNING + ) + + 
import_tags_from_vcs.short_description = 'Import tags from the version control API' + def get_actions(self, request): actions = super().get_actions(request) actions['delete_selected'] = ( diff --git a/readthedocs/core/management/commands/clean_tags.py b/readthedocs/projects/management/commands/clean_tags.py similarity index 76% rename from readthedocs/core/management/commands/clean_tags.py rename to readthedocs/projects/management/commands/clean_tags.py index de60bd5adf2..d6d38aa692e 100644 --- a/readthedocs/core/management/commands/clean_tags.py +++ b/readthedocs/projects/management/commands/clean_tags.py @@ -6,14 +6,17 @@ - Differences only in lowercase/uppercase - Slugify all tags - Remove tags with no projects (old & spam mostly) + +This command can probably be completely removed after being run. +Future tags should be canonicalized because of the new tag parser in +``readthedocs.projects.tag_utils.rtd_parse_tags`` """ from django.core.management.base import BaseCommand -from django.db.models import Count -from taggit.models import Tag from taggit.utils import parse_tags, edit_string_for_tags from readthedocs.projects.models import Project +from readthedocs.projects.tag_utils import remove_unused_tags class Command(BaseCommand): @@ -57,13 +60,10 @@ def reprocess_tags(self): ) def remove_tags_with_no_projects(self): - self.stdout.write('Removing tags with no projects...') - for tag in Tag.objects.all().annotate(num=Count('taggit_taggeditem_items')).filter(num=0): - if not self.dry_run: - self.stdout.write('Removing tag {}'.format(tag.name)) - tag.delete() - else: - self.stdout.write('Not removing tag "{}" (dry run)'.format(tag.name)) + if not self.dry_run: + self.stdout.write('Removing tags with no projects...') + num_deleted, _ = remove_unused_tags() + self.stdout.write('{} unused tags deleted'.format(num_deleted)) def add_arguments(self, parser): parser.add_argument( @@ -71,9 +71,16 @@ def add_arguments(self, parser): action="store_true", help="Don't actually 
perform the actions, just print output", ) + parser.add_argument( + "--remove-unused-only", + action="store_true", + help="Don't canonicalize tags, just delete unused", + ) def handle(self, *args, **options): self.dry_run = options["dry_run"] - self.reprocess_tags() + if not options["remove_unused_only"]: + self.reprocess_tags() + self.remove_tags_with_no_projects() diff --git a/readthedocs/projects/tag_utils.py b/readthedocs/projects/tag_utils.py new file mode 100644 index 00000000000..b0b0412e0d6 --- /dev/null +++ b/readthedocs/projects/tag_utils.py @@ -0,0 +1,84 @@ +"""Customizations to Django Taggit.""" +from allauth.socialaccount.models import SocialApp +from django.db.models import Count +from django.utils.text import slugify +import requests +from taggit.models import Tag +from taggit.utils import _parse_tags + +from .constants import GITHUB_REGEXS + + +def rtd_parse_tags(tag_string): + """ + Parses a string into its tags. + + - Lowercases all tags + - Converts underscores to hyphens + - Slugifies tags + + :see: https://django-taggit.readthedocs.io/page/custom_tagging.html + :param tag_string: a delimited string of tags + :return: a sorted list of tag strings + """ + if tag_string: + tag_string = tag_string.lower().replace('_', '-') + + return sorted([slugify(tag) for tag in _parse_tags(tag_string)]) + + +def remove_unused_tags(): + """Removes all tags that have no corresponding items (projects).""" + return Tag.objects.all().annotate( + num=Count('taggit_taggeditem_items') + ).filter(num=0).delete() + + +def import_tags(project): + """ + Import tags using the version control API. + + Currently, this is only implemented for github. + Uses the client ID and client secret for github otherwise the rate limit is 60/hr. 
+ https://developer.github.com/v3/#rate-limiting + + :returns: A list of the tags set or ``None`` on an error + """ + user = repo = '' + for regex in GITHUB_REGEXS: + match = regex.search(project.repo) + if match: + user, repo = match.groups() + break + + if not user: + return None + + provider = SocialApp.objects.filter(provider='github').first() + if not provider: + return None + + # https://developer.github.com/v3/repos/#list-all-topics-for-a-repository + url = 'https://api.github.com/repos/{user}/{repo}/topics'.format( + user=user, + repo=repo, + ) + headers = { + # Getting topics is a preview API and may change + # It requires this custom Accept header + 'Accept': 'application/vnd.github.mercy-preview+json', + } + params = { + 'client_id': provider.client_id, + 'client_secret': provider.secret, + } + + resp = requests.get(url, headers=headers, params=params) + if resp.ok: + tags = resp.json()['names'] + if tags: + project.tags.set(*tags) + return tags + return [] + + return None diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index d5d677efc80..07c67e07b50 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -465,7 +465,7 @@ def USE_PROMOS(self): # noqa # Taggit # https://django-taggit.readthedocs.io - TAGGIT_TAGS_FROM_STRING = 'readthedocs.core.tag_utils.rtd_parse_tags' + TAGGIT_TAGS_FROM_STRING = 'readthedocs.projects.tag_utils.rtd_parse_tags' # Stripe STRIPE_SECRET = None