python3: decouple bytes and strings

Totktonada · ligurio · ligurio · commit 65e47910af71 · 2021-03-12T15:04:24.000+03:00
In Python 2 the default string type (`<str>`) is a binary string, non-unicode. We receive data from a socket, from a Popen stream, from a file as a string and operate on those strings without any conversions. Python 3 draws a line here. We usually operate on unicode strings in the code (because this is the default string type, `<str>`), but receive bytes from a socket and a Popen stream. We can use unicode or binary streams for files (unicode by default[^1]). This commit decouples bytes and strings. In most cases it means that we convert data from bytes to a string after receiving it from a socket / Popen stream and convert it back from a string to bytes before writing to a socket. Those operations are no-ops on Python 2. So, the general rule for our APIs is to accept and return `<str>` regardless of the Python version. Not `<bytes>`, not `<unicode>`. The only non-trivial change is around `FilteredStream` and writes into `sys.stdout`. The `FilteredStream` instance replaces `sys.stdout` during execution of a test, so it should follow the usual convention and accept `<str>` in the `write()` method. This is both intuitive and necessary, because `*.py` tests rely on `print('bla bla')` to write into a result file. However, the stream should also accept `<bytes>`, because we have a unit test (`unit/json.test`) that produces binary output, which does not conform to UTF-8 encoding. The separate `write_bytes()` method was introduced for this purpose. UnittestServer and AppServer write test output as bytes directly, while TarantoolServer relies on the usual string output. We also use bytes directly when writing from one stream to another: in `app_server.py` for stderr (writing to a log file), and in `tarantool_server.py` for the log destination property (because it is the destination for Popen). [^1]: Technically it depends on the system locale, but, hey, does anyone see a non-UTF-8 locale after the millennium? Part of #20 Co-authored-by: Sergey Bronnikov <sergeyb@tarantool.org>
diff --git a/lib/admin_connection.py b/lib/admin_connection.py
@@ -28,6 +28,8 @@
 from lib.tarantool_connection import TarantoolPool
 from lib.tarantool_connection import TarantoolAsyncConnection
 
+from lib.utils import bytes_to_str
+from lib.utils import str_to_bytes
 
 ADMIN_SEPARATOR = '\n'
 
@@ -36,13 +38,13 @@ def get_handshake(sock, length=128, max_try=100):
     """
     Correct way to get tarantool handshake
     """
-    result = ""
+    result = b""
     i = 0
     while len(result) != length and i < max_try:
-        result = "%s%s" % (result, sock.recv(length-len(result)))
+        result = b"%s%s" % (result, sock.recv(length-len(result)))
         # max_try counter for tarantool/gh-1362
         i += 1
-    return result
+    return bytes_to_str(result)
 
 
 class AdminPool(TarantoolPool):
@@ -61,12 +63,12 @@ def _new_connection(self):
 
 class ExecMixIn(object):
     def cmd(self, socket, cmd, silent):
-        socket.sendall(cmd)
+        socket.sendall(str_to_bytes(cmd))
 
         bufsiz = 4096
         res = ""
         while True:
-            buf = socket.recv(bufsiz)
+            buf = bytes_to_str(socket.recv(bufsiz))
             if not buf:
                 break
             res = res + buf
diff --git a/lib/app_server.py b/lib/app_server.py
@@ -38,8 +38,8 @@ def run_server(execs, cwd, server, logfile, retval):
     timer.start()
     stdout, stderr = server.process.communicate()
     timer.cancel()
-    sys.stdout.write(stdout)
-    with open(logfile, 'a') as f:
+    sys.stdout.write_bytes(stdout)
+    with open(logfile, 'ab') as f:
         f.write(stderr)
     retval['returncode'] = server.process.wait()
     server.process = None
diff --git a/lib/inspector.py b/lib/inspector.py
@@ -6,8 +6,10 @@
 from gevent.lock import Semaphore
 from gevent.server import StreamServer
 
+from lib.utils import bytes_to_str
 from lib.utils import find_port
 from lib.utils import prefix_each_line
+from lib.utils import str_to_bytes
 from lib.colorer import color_stdout
 from lib.colorer import color_log
 from lib.colorer import qa_notice
@@ -77,7 +79,7 @@ def readline(socket, delimiter='\n', size=4096):
 
         while data:
             try:
-                data = socket.recv(size)
+                data = bytes_to_str(socket.recv(size))
             except IOError:
                 # catch instance halt connection refused errors
                 data = ''
@@ -119,7 +121,7 @@ def handle(self, socket, addr):
             color_log("DEBUG: test-run's response for [{}]\n{}\n".format(
                 line, prefix_each_line(' | ', result)),
                 schema='test-run command')
-            socket.sendall(result)
+            socket.sendall(str_to_bytes(result))
 
         self.sem.release()
 
diff --git a/lib/tarantool_server.py b/lib/tarantool_server.py
@@ -35,6 +35,7 @@
 from lib.server import Server
 from lib.server import DEFAULT_SNAPSHOT_NAME
 from lib.test import Test
+from lib.utils import bytes_to_str
 from lib.utils import find_port
 from lib.utils import extract_schema_from_snapshot
 from lib.utils import format_process
@@ -94,8 +95,8 @@ def result_file_version(self):
         if not os.path.isfile(self.result):
             return self.RESULT_FILE_VERSION_DEFAULT
 
-        with open(self.result, 'r') as f:
-            line = f.readline().rstrip('\n')
+        with open(self.result, 'rb') as f:
+            line = bytes_to_str(f.readline()).rstrip('\n')
 
             # An empty line or EOF.
             if not line:
@@ -577,7 +578,7 @@ def _iproto(self, port):
     @property
     def log_des(self):
         if not hasattr(self, '_log_des'):
-            self._log_des = open(self.logfile, 'a')
+            self._log_des = open(self.logfile, 'ab')
         return self._log_des
 
     @log_des.deleter
@@ -663,7 +664,7 @@ def __del__(self):
     @classmethod
     def version(cls):
         p = subprocess.Popen([cls.binary, "--version"], stdout=subprocess.PIPE)
-        version = p.stdout.read().rstrip()
+        version = bytes_to_str(p.stdout.read()).rstrip()
         p.wait()
         return version
 
@@ -1162,7 +1163,7 @@ def test_option_get(self, option_list_str, silent=False):
                                   cwd=self.vardir,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT).stdout.read()
-        return output
+        return bytes_to_str(output)
 
     def test_option(self, option_list_str):
         print(self.test_option_get(option_list_str))
diff --git a/lib/test.py b/lib/test.py
@@ -9,19 +9,14 @@
 from functools import partial
 from hashlib import md5
 
-try:
-    # Python 2
-    from StringIO import StringIO
-except ImportError:
-    # Python 3
-    from io import StringIO
-
 from lib import Options
 from lib.colorer import color_stdout
+from lib.utils import assert_bytes
 from lib.utils import non_empty_valgrind_logs
 from lib.utils import print_tail_n
 from lib.utils import print_unidiff as utils_print_unidiff
 from lib.utils import safe_makedirs
+from lib.utils import str_to_bytes
 from lib import pytap13
 
 
@@ -48,21 +43,17 @@ def __repr__(self):
 class FilteredStream:
     """Helper class to filter .result file output"""
     def __init__(self, filename):
-        #
-        # always open the output stream in line-buffered mode,
-        # to see partial results of a failed test
-        #
-        self.stream = open(filename, "w+", 1)
+        self.stream = open(filename, "wb+")
         self.filters = []
         self.inspector = None
 
-    def write(self, fragment):
-        """Apply all filters, then write result to the undelrying stream.
-        Do line-oriented filtering: the fragment doesn't have to represent
-        just one line."""
-        fragment_stream = StringIO(fragment)
+    def write_bytes(self, fragment):
+        """ The same as ``write()``, but accepts ``<bytes>`` as
+            input.
+        """
+        assert_bytes(fragment)
         skipped = False
-        for line in fragment_stream:
+        for line in fragment.splitlines(True):
             original_len = len(line.strip())
             for pattern, replacement in self.filters:
                 line = re.sub(pattern, replacement, line)
@@ -73,8 +64,20 @@ def write(self, fragment):
             if not skipped:
                 self.stream.write(line)
 
+    def write(self, fragment):
+        """ Apply all filters, then write result to the underlying
+            stream.
+
+            Do line-oriented filtering: the fragment doesn't have
+            to represent just one line.
+
+            Accepts ``<str>`` as input, just like the standard
+            ``sys.stdout.write()``.
+        """
+        self.write_bytes(str_to_bytes(fragment))
+
     def push_filter(self, pattern, replacement):
-        self.filters.append([pattern, replacement])
+        self.filters.append([str_to_bytes(pattern), str_to_bytes(replacement)])
 
     def pop_filter(self):
         self.filters.pop()
diff --git a/lib/unittest_server.py b/lib/unittest_server.py
@@ -16,7 +16,7 @@ def execute(self, server):
         server.current_test = self
         execs = server.prepare_args()
         proc = Popen(execs, cwd=server.vardir, stdout=PIPE, stderr=STDOUT)
-        sys.stdout.write(proc.communicate()[0])
+        sys.stdout.write_bytes(proc.communicate()[0])
 
 
 class UnittestServer(Server):
diff --git a/lib/utils.py b/lib/utils.py
@@ -22,6 +22,7 @@
 
 # Useful for very coarse version differentiation.
 PY3 = sys.version_info[0] == 3
+PY2 = sys.version_info[0] == 2
 
 if PY3:
     string_types = str,
@@ -312,3 +313,41 @@ def extract_schema_from_snapshot(snapshot_path):
             if res[0] == 'version':
                 return res
     return None
+
+
+def assert_bytes(b):
+    """ Ensure given value is <bytes>.
+    """
+    if type(b) != bytes:
+        raise ValueError('Internal error: expected {}, got {}: {}'.format(
+            str(bytes), str(type(b)), repr(b)))
+
+
+def assert_str(s):
+    """ Ensure given value is <str>.
+    """
+    if type(s) != str:
+        raise ValueError('Internal error: expected {}, got {}: {}'.format(
+            str(str), str(type(s)), repr(s)))
+
+
+def bytes_to_str(b):
+    """ Convert <bytes> to <str>.
+
+        No-op on Python 2.
+    """
+    assert_bytes(b)
+    if PY2:
+        return b
+    return b.decode('utf-8')
+
+
+def str_to_bytes(s):
+    """ Convert <str> to <bytes>.
+
+        No-op on Python 2.
+    """
+    assert_str(s)
+    if PY2:
+        return s
+    return s.encode('utf-8')