Merge pull request lingpy#127 from xrotwang/unifyFileAccess

Johann-Mattis List · Johann-Mattis List · commit 4a2bb04dcf3b · 2014-11-11T09:16:33.000+01:00
Unify file access
diff --git a/lingpy/data/derive.py b/lingpy/data/derive.py
@@ -19,66 +19,52 @@
 __date__="2014-11-10"
 
 
-import unicodedata
-from pickle import dump
 import os
-import codecs
 
-# lingpy imports
 from ..settings import rcParams
 from ..algorithm import misc
 from ..convert.strings import scorer2str
 from ..read import *
 from .. import cache
+from .. import util
 
 try:
     import networkx as nx
 except ImportError:
     print(rcParams['W_missing_module'].format("networkx"))
 
+
+def _read(filename, normalize=None):
+    res = {}
+    for line in util.read_text_file(filename, normalize=normalize, lines=True):
+        k, v = line.strip().split(' : ')
+        res[k] = v.split(', ')
+    return res
+
+
 def _import_sound_classes(filename):
     """
     Function imports individually defined sound classes from a text file and 
     creates a replacement dictionary from these sound classes.
     """
-
-    infile = codecs.open(filename,'r','utf-8')
-    data = []
-    for line in infile:
-        data.append(
-                unicodedata.normalize('NFC', line.strip()).split(' : ')
-                )
-
-    sc_dict = {}
-    for el1,el2 in data:
-        sc_dict[el1] = el2.split(', ')
-
     sc_repl_dict = {}
-    
-    for key in sc_dict.keys():
-        for value in sc_dict[key]:
+    for key, values in _read(filename, normalize='NFC').items():
+        for value in values:
             if rcParams['debug']: print(value,key)
             sc_repl_dict[value] = key
 
     return sc_repl_dict
 
+
 def _import_score_tree(filename):
     """
     Function imports score trees for a given range of sound classes and
     converts them into a graph.
     """
-    infile = codecs.open(filename,'r','utf-8')
     graph = nx.DiGraph()
-    data = []
-    for line in infile:
-        data.append(line.strip().split(' : '))
-    score_tree = {}
-    for el1,el2 in data:
-        score_tree[el1] = el2.split(', ')
-    for key in score_tree.keys():
-        graph.add_node(key, val=score_tree[key][0])
-    for key in score_tree.keys():
-        for value in score_tree[key][1:]:
+    for key, values in _read(filename).items():
+        graph.add_node(key, val=values[0])
+        for value in values[1:]:
             if value != '-':
                 node,weight = value.split(':')
                 graph.add_edge(key,node,weight=int(weight))
@@ -383,16 +369,12 @@ def _export_score_dict(score_dict):
 
     @todo: This function can be better ported to another file.
     """
-    
     letters = list(set([key[0] for key in score_dict.keys()]))
-    outfile = codecs.open('score_dict.csv','w','utf-8')
-    outfile.write('\t'+'\t'.join(letters)+'\n')
-    for letter1 in letters:
-        outfile.write(letter1)
-        for letter2 in letters:
-            outfile.write('\t' + str(score_dict[(letter1,letter2)]))
-        outfile.write('\n')
-    outfile.close()
+    rows = [['+'] + letters]
+    for l1 in letters:
+        rows.append([l1] + [str(score_dict[(l1, l2)]) for l2 in letters])
+    util.write_text_file('score_dict.csv', '\n'.join('\t'.join(row) for row in rows))
+
 
 def compile_model(
         model,
@@ -541,10 +523,8 @@ def compile_model(
         scorer = misc.ScoreDict(chars,matrix)
         
         # create the matrix file
-        f = codecs.open(os.path.join(new_path,'matrix'),'w','utf-8')
-        f.write(scorer2str(scorer))
-        f.close()
-    
+        util.write_text_file(os.path.join(new_path,'matrix'), scorer2str(scorer))
+
     if scorer:
         cache.dump(scorer, model+'.scorer')
         print("... successfully created the scorer.")
@@ -577,7 +557,6 @@ def compile_dvt(path=''):
     lingpy.data.model.Model
     lingpy.data.derive.compile_model
     """
-    
     print("[i] Compiling diacritics and vowels...")
 
     # get the path to the models
@@ -596,39 +575,24 @@ def compile_dvt(path=''):
                 'dvt_el'
                 )
     else:
-        pass
-
-    diacritics = codecs.open(
-            os.path.join(file_path,'diacritics'),
-            'r',
-            'utf-8'
-            ).read().replace('\n','').replace('-','')
-    vowels = codecs.open(
-            os.path.join(file_path,'vowels'),
-            'r',
-            'utf-8'
-            ).read().replace('\n','')
-
-    tones = codecs.open(
-            os.path.join(file_path,'tones'),
-            'r',
-            'utf-8'
-            ).read().replace('\n','')
-    
-    # normalize stuff
-    # TODO: this is potentially dangerous and it is important to decide whether
-    # TODO: switching to NFD might not be a better choice
-    diacritics = unicodedata.normalize("NFC", diacritics)
-    vowels = unicodedata.normalize("NFC", vowels)
-    vowels = ''.join([v for v in vowels if v not in diacritics])
-    tones = unicodedata.normalize("NFC", tones)
-    
-    dvt = (diacritics,vowels,tones)
+        file_path = path
+
+    def _read_string(name):
+        # Normalize the file content to NFC.
+        # TODO: this is potentially dangerous; it is important to decide whether
+        # TODO: switching to NFD would be a better choice.
+        return util.read_text_file(
+            os.path.join(file_path, name), normalize='NFC').replace('\n', '')
+
+    diacritics = _read_string('diacritics').replace('-', '')
+    vowels = ''.join([v for v in _read_string('vowels') if v not in diacritics])
+    tones = _read_string('tones')
+
+    dvt = (diacritics, vowels, tones)
     
     if path in ['evolaemp', 'el']:
         cache.dump(dvt, 'dvt_el')
     else:
         cache.dump(dvt, 'dvt')
 
     if rcParams['verbose']: print("[i] Diacritics and sound classes were successfully compiled.")
-
diff --git a/lingpy/tests/data/__init__.py b/lingpy/tests/data/__init__.py
diff --git a/lingpy/tests/data/test_derive.py b/lingpy/tests/data/test_derive.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals, print_function, division, absolute_import
+
+from lingpy.tests.util import WithTempDir
+from lingpy import cache
+
+
+SCORER = """\
+p : c, -
+b : c, -
+f : c, -
+v : c, -
+m : v, -
+w : v, -
+8 : t, -"""
+
+
+class TestDerive(WithTempDir):
+    def setUp(self):
+        WithTempDir.setUp(self)
+        self.tmp_path('_test').mkdir()
+        with self.tmp_path('_test', 'converter').open(mode='w', encoding='utf8') as fp:
+            fp.write("""\
+p : p, ɸ, p͡f
+b : b, β, b͡v
+f : f
+v : v
+m : m, ɱ
+w : w, ɰ, ʋ, ʍ
+8 : θ, θ, Ɵ, ð""")
+
+        with self.tmp_path('_test', 'scorer').open(mode='w', encoding='utf8') as fp:
+            fp.write(SCORER)
+
+    def test_compile_model(self):
+        from lingpy.data.derive import compile_model
+
+        compile_model('_test', self.tmp)
+        sound_classes = cache.load('_test.converter')
+        self.assertEqual(sound_classes['b'], 'b')
+        self.assertTrue(self.tmp_path('_test', 'matrix').exists())
+
+    def test_compile_dvt(self):
+        from lingpy.data.derive import compile_dvt
+
+        compile_dvt()
+        self.assertEqual(len(cache.load('dvt')), 3)
diff --git a/lingpy/tests/util.py b/lingpy/tests/util.py
@@ -1,5 +1,10 @@
 """Utilities used in lingpy tests"""
 import os
+import unittest
+from tempfile import mkdtemp
+import shutil
+
+from pathlib import Path
 
 
 def test_data(*comps):
@@ -9,3 +14,14 @@ def test_data(*comps):
     :return: Absolute path to the specified test data file.
     """
     return os.path.join(os.path.abspath(os.path.dirname(__file__)), 'test_data', *comps)
+
+
+class WithTempDir(unittest.TestCase):
+    def setUp(self):
+        self.tmp = mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp, ignore_errors=True)
+
+    def tmp_path(self, *comps):
+        return Path(self.tmp).joinpath(*comps)
diff --git a/lingpy/util.py b/lingpy/util.py
@@ -0,0 +1,34 @@
+import io
+import unicodedata
+
+from pathlib import Path
+from six import text_type
+
+
+def _str_path(path):
+    return text_type(path) if isinstance(path, Path) else path
+
+
+def write_text_file(path, content):
+    with io.open(_str_path(path), 'w', encoding='utf8') as fp:
+        fp.write(content)
+
+
+def read_text_file(path, normalize=None, lines=False):
+    """Reads a text file encoded in utf-8.
+
+    :param path: File-system path of the file.
+    :param normalize: Unicode normalization form to apply (e.g. 'NFC'); if `None`, the text is not normalized.
+    :param lines: Flag signalling whether to return a list of lines (line endings are kept).
+    :return: File content as unicode object or list of lines as unicode objects.
+
+    .. note:: The whole file is read into memory.
+    """
+    def _normalize(chunk):
+        return unicodedata.normalize(normalize, chunk) if normalize else chunk
+
+    with io.open(_str_path(path), 'r', encoding='utf8') as fp:
+        if lines:
+            return [_normalize(line) for line in fp]
+        else:
+            return _normalize(fp.read())