Skip to content

Commit 4a2bb04

Browse files
author
Johann-Mattis List
committed
Merge pull request lingpy#127 from xrotwang/unifyFileAccess
Unify file access
2 parents 852d231 + 81f2f1b commit 4a2bb04

File tree

5 files changed

+134
-73
lines changed

5 files changed

+134
-73
lines changed

lingpy/data/derive.py

Lines changed: 37 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -19,66 +19,52 @@
1919
__date__="2014-11-10"
2020

2121

22-
import unicodedata
23-
from pickle import dump
2422
import os
25-
import codecs
2623

27-
# lingpy imports
2824
from ..settings import rcParams
2925
from ..algorithm import misc
3026
from ..convert.strings import scorer2str
3127
from ..read import *
3228
from .. import cache
29+
from .. import util
3330

3431
try:
3532
import networkx as nx
3633
except ImportError:
3734
print(rcParams['W_missing_module'].format("networkx"))
3835

36+
37+
def _read(filename, normalize=None):
    """
    Parse a file of ``key : value1, value2, ...`` lines into a dictionary.

    :param filename: Path of the text file to read.
    :param normalize: Unicode normalization form passed on to
        ``util.read_text_file``; ``None`` leaves the text as read.
    :return: Dictionary mapping each key to the list of its values.
    :raises ValueError: If a non-blank line does not consist of exactly one
        key and one value part separated by `` : ``.
    """
    res = {}
    for line in util.read_text_file(filename, normalize=normalize, lines=True):
        line = line.strip()
        # Blank lines (typically a trailing one at the end of the file) hold
        # no definition; trying to unpack them would raise a ValueError.
        if not line:
            continue
        k, v = line.split(' : ')
        res[k] = v.split(', ')
    return res
43+
44+
3945
def _import_sound_classes(filename):
4046
"""
4147
Function imports individually defined sound classes from a text file and
4248
creates a replacement dictionary from these sound classes.
4349
"""
44-
45-
infile = codecs.open(filename,'r','utf-8')
46-
data = []
47-
for line in infile:
48-
data.append(
49-
unicodedata.normalize('NFC', line.strip()).split(' : ')
50-
)
51-
52-
sc_dict = {}
53-
for el1,el2 in data:
54-
sc_dict[el1] = el2.split(', ')
55-
5650
sc_repl_dict = {}
57-
58-
for key in sc_dict.keys():
59-
for value in sc_dict[key]:
51+
for key, values in _read(filename, normalize='NFC').items():
52+
for value in values:
6053
if rcParams['debug']: print(value,key)
6154
sc_repl_dict[value] = key
6255

6356
return sc_repl_dict
6457

58+
6559
def _import_score_tree(filename):
6660
"""
6761
Function imports score trees for a given range of sound classes and
6862
converts them into a graph.
6963
"""
70-
infile = codecs.open(filename,'r','utf-8')
7164
graph = nx.DiGraph()
72-
data = []
73-
for line in infile:
74-
data.append(line.strip().split(' : '))
75-
score_tree = {}
76-
for el1,el2 in data:
77-
score_tree[el1] = el2.split(', ')
78-
for key in score_tree.keys():
79-
graph.add_node(key, val=score_tree[key][0])
80-
for key in score_tree.keys():
81-
for value in score_tree[key][1:]:
65+
for key, values in _read(filename).items():
66+
graph.add_node(key, val=values[0])
67+
for value in values[1:]:
8268
if value != '-':
8369
node,weight = value.split(':')
8470
graph.add_edge(key,node,weight=int(weight))
@@ -383,16 +369,12 @@ def _export_score_dict(score_dict):
383369
384370
@todo: This function can be better ported to another file.
385371
"""
386-
387372
letters = list(set([key[0] for key in score_dict.keys()]))
388-
outfile = codecs.open('score_dict.csv','w','utf-8')
389-
outfile.write('\t'+'\t'.join(letters)+'\n')
390-
for letter1 in letters:
391-
outfile.write(letter1)
392-
for letter2 in letters:
393-
outfile.write('\t' + str(score_dict[(letter1,letter2)]))
394-
outfile.write('\n')
395-
outfile.close()
373+
rows = [['+'] + letters]
374+
for l1 in letters:
375+
rows.append([l1] + [str(score_dict[(l1, l2)]) for l2 in letters])
376+
util.write_text_file('score_dict.csv', '\n'.join('\t'.join(row) for row in rows))
377+
396378

397379
def compile_model(
398380
model,
@@ -541,10 +523,8 @@ def compile_model(
541523
scorer = misc.ScoreDict(chars,matrix)
542524

543525
# create the matrix file
544-
f = codecs.open(os.path.join(new_path,'matrix'),'w','utf-8')
545-
f.write(scorer2str(scorer))
546-
f.close()
547-
526+
util.write_text_file(os.path.join(new_path,'matrix'), scorer2str(scorer))
527+
548528
if scorer:
549529
cache.dump(scorer, model+'.scorer')
550530
print("... successfully created the scorer.")
@@ -577,7 +557,6 @@ def compile_dvt(path=''):
577557
lingpy.data.model.Model
578558
lingpy.data.derive.compile_model
579559
"""
580-
581560
print("[i] Compiling diacritics and vowels...")
582561

583562
# get the path to the models
@@ -596,39 +575,24 @@ def compile_dvt(path=''):
596575
'dvt_el'
597576
)
598577
else:
599-
pass
600-
601-
diacritics = codecs.open(
602-
os.path.join(file_path,'diacritics'),
603-
'r',
604-
'utf-8'
605-
).read().replace('\n','').replace('-','')
606-
vowels = codecs.open(
607-
os.path.join(file_path,'vowels'),
608-
'r',
609-
'utf-8'
610-
).read().replace('\n','')
611-
612-
tones = codecs.open(
613-
os.path.join(file_path,'tones'),
614-
'r',
615-
'utf-8'
616-
).read().replace('\n','')
617-
618-
# normalize stuff
619-
# TODO: this is potentially dangerous and it is important to decide whether
620-
# TODO: switching to NFD might not be a better choice
621-
diacritics = unicodedata.normalize("NFC", diacritics)
622-
vowels = unicodedata.normalize("NFC", vowels)
623-
vowels = ''.join([v for v in vowels if v not in diacritics])
624-
tones = unicodedata.normalize("NFC", tones)
625-
626-
dvt = (diacritics,vowels,tones)
578+
file_path = path
579+
580+
def _read_string(name):
581+
# normalize stuff
582+
# TODO: this is potentially dangerous and it is important to decide whether
583+
# TODO: switching to NFD might not be a better choice
584+
return util.read_text_file(
585+
os.path.join(file_path, name), normalize='NFC').replace('\n', '')
586+
587+
diacritics = _read_string('diacritics').replace('-', '')
588+
vowels = ''.join([v for v in _read_string('vowels') if v not in diacritics])
589+
tones = _read_string('tones')
590+
591+
dvt = (diacritics, vowels, tones)
627592

628593
if path in ['evolaemp', 'el']:
629594
cache.dump(dvt, 'dvt_el')
630595
else:
631596
cache.dump(dvt, 'dvt')
632597

633598
if rcParams['verbose']: print("[i] Diacritics and sound classes were successfully compiled.")
634-

lingpy/tests/data/__init__.py

Whitespace-only changes.

lingpy/tests/data/test_derive.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# *-* coding: utf-8 *-*
2+
from __future__ import unicode_literals, print_function, division, absolute_import
3+
4+
from lingpy.tests.util import WithTempDir
5+
from lingpy import cache
6+
7+
8+
SCORER = """\
9+
p : c, -
10+
b : c, -
11+
f : c, -
12+
v : c, -
13+
m : v, -
14+
w : v, -
15+
8 : t, -"""
16+
17+
18+
class TestDerive(WithTempDir):
    """Tests for the compile functions of lingpy.data.derive."""

    def setUp(self):
        WithTempDir.setUp(self)
        # Model directory "_test" below the temporary directory, holding the
        # two definition files written below.
        model_dir = self.tmp_path('_test')
        model_dir.mkdir()

        converter = """\
p : p, ɸ, p͡f
b : b, β, b͡v
f : f
v : v
m : m, ɱ
w : w, ɰ, ʋ, ʍ
8 : θ, θ, Ɵ, ð"""

        for name, content in [('converter', converter), ('scorer', SCORER)]:
            with model_dir.joinpath(name).open(mode='w', encoding='utf8') as fp:
                fp.write(content)

    def test_compile_model(self):
        from lingpy.data.derive import compile_model

        compile_model('_test', self.tmp)

        # The compiled converter is looked up in the cache ...
        converter = cache.load('_test.converter')
        self.assertEqual(converter['b'], 'b')

        # ... while the matrix file is expected next to the model sources.
        matrix = self.tmp_path('_test', 'matrix')
        self.assertTrue(matrix.exists())

    def test_compile_dvt(self):
        from lingpy.data.derive import compile_dvt

        compile_dvt()

        # The cached object is expected to have three members.
        dvt = cache.load('dvt')
        self.assertEqual(len(dvt), 3)

lingpy/tests/util.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
"""Utilities used in lingpy tests"""
22
import os
3+
import unittest
4+
from tempfile import mkdtemp
5+
import shutil
6+
7+
from pathlib import Path
38

49

510
def test_data(*comps):
@@ -9,3 +14,14 @@ def test_data(*comps):
914
:return: Absolute path to the specified test data file.
1015
"""
1116
return os.path.join(os.path.abspath(os.path.dirname(__file__)), 'test_data', *comps)
17+
18+
19+
class WithTempDir(unittest.TestCase):
    """Base class for tests which need a scratch directory.

    A fresh temporary directory is created for each test and made available
    as ``self.tmp``; it is removed again when the test is over.
    """

    def setUp(self):
        self.tmp = mkdtemp()
        # Cleanups also run when a subclass' setUp fails after calling this
        # method - a case in which unittest does not call tearDown, so the
        # directory would be left behind.
        self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True)

    def tearDown(self):
        # Kept for subclasses which call WithTempDir.tearDown explicitly;
        # harmless in combination with the cleanup registered in setUp since
        # errors (like an already removed directory) are ignored.
        shutil.rmtree(self.tmp, ignore_errors=True)

    def tmp_path(self, *comps):
        """
        :param comps: Path components below the temporary directory.
        :return: `Path` object for the components joined to the temporary directory.
        """
        return Path(self.tmp).joinpath(*comps)

lingpy/util.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import io
2+
import unicodedata
3+
4+
from pathlib import Path
5+
from six import text_type
6+
7+
8+
def _str_path(path):
    """Return `Path` objects converted to text; any other object is passed through."""
    if isinstance(path, Path):
        return text_type(path)
    return path
10+
11+
12+
def write_text_file(path, content):
    """Writes text to a file, encoded in utf-8.

    :param path: File-system path of the file (a string or a `Path` object).
    :param content: The text to write. Byte strings are taken to hold utf-8
        encoded text and are decoded before writing.
    :raises UnicodeDecodeError: If a byte string is passed which is not valid utf-8.
    """
    if isinstance(content, bytes):
        # Files opened by io.open in text mode accept unicode objects only;
        # byte strings (which is what native `str` is on Python 2) would be
        # rejected with a TypeError.
        content = content.decode('utf8')
    with io.open(_str_path(path), 'w', encoding='utf8') as fp:
        fp.write(content)
15+
16+
17+
def read_text_file(path, normalize=None, lines=False):
    """Loads the content of a text file encoded in utf-8.

    :param path: Location of the file in the file system.
    :param normalize: Name of the unicode normalization form to apply to the
        text; `None` (or any other false value) leaves the text as read.
    :param lines: If true, a list of lines is returned instead of one string.
    :return: The complete text as unicode object, or a list of unicode objects,
        one per line as yielded by iterating over the file.

    .. note:: The file is loaded into memory completely.
    """
    with io.open(_str_path(path), 'r', encoding='utf8') as handle:
        chunks = list(handle) if lines else [handle.read()]

    if normalize:
        chunks = [unicodedata.normalize(normalize, chunk) for chunk in chunks]
    return chunks if lines else chunks[0]

0 commit comments

Comments
 (0)