Skip to content

Add support for unicode Git tags/branches #2997

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 76 additions & 2 deletions readthedocs/rtd_tests/tests/test_backend.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
from __future__ import absolute_import
# -*- coding: utf-8 -*-

from __future__ import absolute_import, unicode_literals

import re
from os.path import exists

from django.contrib.auth.models import User

from readthedocs.projects.models import Project
from readthedocs.rtd_tests.base import RTDTestCase

from readthedocs.rtd_tests.utils import make_test_git, make_test_hg
from readthedocs.vcs_support.backends.git import Backend


class TestGitBackend(RTDTestCase):

def setUp(self):
git_repo = make_test_git()
super(TestGitBackend, self).setUp()
Expand All @@ -24,6 +29,20 @@ def setUp(self):
)
self.project.users.add(self.eric)

def test_branch_regex(self):
data = " origin/HEAD -> origin/master"
self.assertRegexpMatches(data, Backend.BRANCH_REGEX)

data = """
origin/master
origin/HEAD -> origin/master
"""
matches = Backend.BRANCH_REGEX.findall(data)
self.assertEqual(matches, [
'origin/master',
'origin/HEAD -> origin/master',
])

def test_parse_branches(self):
data = """
develop
Expand All @@ -47,11 +66,34 @@ def test_parse_branches(self):
self.project.vcs_repo().parse_branches(data)]
self.assertEqual(expected_ids, given_ids)

def test_parse_unicode_branch(self):
data = """
origin/üñîçø∂é
"""
expected_ids = [('origin/üñîçø∂é', 'üñîçø∂é')]
given_ids = [(x.identifier, x.verbose_name) for x in
self.project.vcs_repo().parse_branches(data)]
self.assertEqual(expected_ids, given_ids)

def test_git_checkout(self):
repo = self.project.vcs_repo()
repo.checkout()
self.assertTrue(exists(repo.working_dir))

def test_tag_regex(self):
data = "bd533a768ff661991a689d3758fcfe72f455435d refs/tags/1.0"
self.assertRegexpMatches(data, Backend.TAG_REGEX)

data = """
3b32886c8d3cb815df3793b3937b2e91d0fb00f1 refs/tags/2.0.0
bd533a768ff661991a689d3758fcfe72f455435d refs/tags/2.0.1
"""
matches = Backend.TAG_REGEX.findall(data)
self.assertEqual(matches, [
('3b32886c8d3cb815df3793b3937b2e91d0fb00f1', 'refs/tags/2.0.0'),
('bd533a768ff661991a689d3758fcfe72f455435d', 'refs/tags/2.0.1'),
])

def test_parse_git_tags(self):
data = """\
3b32886c8d3cb815df3793b3937b2e91d0fb00f1 refs/tags/2.0.0
Expand All @@ -74,8 +116,21 @@ def test_parse_git_tags(self):
self.project.vcs_repo().parse_tags(data)]
self.assertEqual(expected_tags, given_ids)

def test_parse_unicode_git_tags(self):
data = """\
bd533a768ff661991a689d3758fcfe72f455435d refs/tags/release-ünîø∂é
"""
expected_tags = [
('bd533a768ff661991a689d3758fcfe72f455435d', 'release-ünîø∂é'),
]

given_ids = [(x.identifier, x.verbose_name) for x in
self.project.vcs_repo().parse_tags(data)]
self.assertEqual(expected_tags, given_ids)


class TestHgBackend(RTDTestCase):

def setUp(self):
hg_repo = make_test_hg()
super(TestHgBackend, self).setUp()
Expand All @@ -101,6 +156,13 @@ def test_parse_branches(self):
self.project.vcs_repo().parse_branches(data)]
self.assertEqual(expected_ids, given_ids)

def test_parse_unicode_branches(self):
data = "üñîçø∂é"
expected_ids = ['üñîçø∂é']
given_ids = [x.identifier for x in
self.project.vcs_repo().parse_branches(data)]
self.assertEqual(expected_ids, given_ids)

def test_checkout(self):
repo = self.project.vcs_repo()
repo.checkout()
Expand All @@ -122,3 +184,15 @@ def test_parse_tags(self):
given_ids = [(x.identifier, x.verbose_name) for x in
self.project.vcs_repo().parse_tags(data)]
self.assertEqual(expected_tags, given_ids)

def test_parse_unicode_tags(self):
data = """\
üñîçø∂é 13575:8e94a1b4e9a4
"""
expected_tags = [
('8e94a1b4e9a4', 'üñîçø∂é'),
]

given_ids = [(x.identifier, x.verbose_name) for x in
self.project.vcs_repo().parse_tags(data)]
self.assertEqual(expected_tags, given_ids)
80 changes: 39 additions & 41 deletions readthedocs/vcs_support/backends/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,13 @@

from __future__ import absolute_import

import re
import logging
import csv
import os
import re

from builtins import bytes, str # pylint: disable=redefined-builtin
from readthedocs.projects.exceptions import ProjectImportError
from readthedocs.vcs_support.base import BaseVCS, VCSVersion

from future import standard_library
standard_library.install_aliases()
from io import StringIO # noqa


log = logging.getLogger(__name__)

Expand All @@ -27,6 +21,25 @@ class Backend(BaseVCS):
supports_branches = True
fallback_branch = 'master' # default branch

TAG_REGEX = re.compile(
r'''
^\s*
(?P<hash>[0-9a-f]+)
\s+
(?P<tag>.*)
(?:\n|$)
''',
(re.VERBOSE | re.MULTILINE)
)
BRANCH_REGEX = re.compile(
r'''
^\s*
(?P<branch>\w.+)
(?:\n|$)
''',
(re.VERBOSE | re.MULTILINE)
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These could use some comments that explain them.


def __init__(self, *args, **kwargs):
super(Backend, self).__init__(*args, **kwargs)
self.token = kwargs.get('token', None)
Expand Down Expand Up @@ -115,19 +128,11 @@ def parse_tags(self, data):
hash as identifier.
"""
# parse the lines into a list of tuples (commit-hash, tag ref name)
# StringIO below is expecting Unicode data, so ensure that it gets it.
if not isinstance(data, str):
data = str(data)
raw_tags = csv.reader(StringIO(data), delimiter=' ')
vcs_tags = []
for row in raw_tags:
row = [f for f in row if f != '']
if row == []:
continue
commit_hash, name = row
clean_name = name.split('/')[-1]
vcs_tags.append(VCSVersion(self, commit_hash, clean_name))
return vcs_tags
tags = []
for match in self.TAG_REGEX.finditer(data):
tag = match.group('tag').split('/')[-1]
tags.append(VCSVersion(self, match.group('hash'), tag))
return tags

@property
def branches(self):
Expand All @@ -151,27 +156,20 @@ def parse_branches(self, data):
origin/release/2.0.0
origin/release/2.1.0
"""
clean_branches = []
# StringIO below is expecting Unicode data, so ensure that it gets it.
if not isinstance(data, str):
data = str(data)
raw_branches = csv.reader(StringIO(data), delimiter=' ')
for branch in raw_branches:
branch = [f for f in branch if f != '' and f != '*']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'*' and '' are excluded because of the regex (\w doesn't match '*', so it that split things properly then this would be okay.

# Handle empty branches
if branch:
branch = branch[0]
if branch.startswith('origin/'):
cut_len = len('origin/')
slug = branch[cut_len:].replace('/', '-')
if slug in ['HEAD']:
continue
clean_branches.append(VCSVersion(self, branch, slug))
else:
# Believe this is dead code.
slug = branch.replace('/', '-')
clean_branches.append(VCSVersion(self, branch, slug))
return clean_branches
branches = []
for match in self.BRANCH_REGEX.finditer(data):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like we're depending on some regex magic here that isn't easy to understand. It would be good to document exactly how this is working. Are we still handling empty branches, and * names? Are they getting filtered out with regex magic?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what is meant by empty branches or * names. I only replicated what the csv parser was doing, no regex magic here, just basic regex to split on whitespace.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous code was doing branch = [f for f in branch if f != '' and f != '*'], which seemed to be filtering out things?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add some test cases for this. I missed the '*', but empty is likely not being caught anyways.

branch = match.group('branch')
if branch.startswith('origin/HEAD'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

origin/HEAD-this-is-not would match, although unlikely, this should be an equality test.

continue
elif branch.startswith('origin/'):
cut_len = len('origin/')
slug = branch[cut_len:].replace('/', '-')
branches.append(VCSVersion(self, branch, slug))
else:
# Believe this is dead code.
slug = branch.replace('/', '-')
branches.append(VCSVersion(self, branch, slug))
return branches

@property
def commit(self):
Expand Down
2 changes: 2 additions & 0 deletions readthedocs/vcs_support/backends/hg.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Mercurial-related utilities."""

from __future__ import absolute_import

from readthedocs.projects.exceptions import ProjectImportError
from readthedocs.vcs_support.base import BaseVCS, VCSVersion

Expand Down