19
19
__date__ = "2014-11-10"
20
20
21
21
22
- import unicodedata
23
- from pickle import dump
24
22
import os
25
- import codecs
26
23
27
- # lingpy imports
28
24
from ..settings import rcParams
29
25
from ..algorithm import misc
30
26
from ..convert .strings import scorer2str
31
27
from ..read import *
32
28
from .. import cache
29
+ from .. import util
33
30
34
31
try :
35
32
import networkx as nx
36
33
except ImportError :
37
34
print (rcParams ['W_missing_module' ].format ("networkx" ))
38
35
36
+
37
def _read(filename, normalize=None):
    """
    Read a ``key : value1, value2, ...`` mapping from a text file.

    Parameters
    ----------
    filename : str
        Path of the file to read; handed to ``util.read_text_file``.
    normalize : str, optional
        Passed through unchanged as the ``normalize`` argument of
        ``util.read_text_file``.

    Returns
    -------
    dict
        Maps the text before the first ``' : '`` of each line to the list
        obtained by splitting the rest of the line on ``', '``.

    Raises
    ------
    ValueError
        If a non-blank line does not contain the ``' : '`` separator.
    """
    res = {}
    for line in util.read_text_file(filename, normalize=normalize, lines=True):
        line = line.strip()
        # Blank lines carry no entry; unpacking the split result used to
        # crash on them, so they are skipped instead.
        if not line:
            continue
        # Split on the first separator only, so that a value which itself
        # contains ' : ' no longer breaks the two-value unpacking.
        key, separator, values = line.partition(' : ')
        if not separator:
            raise ValueError(
                "Line {0!r} in file {1!r} lacks the ' : ' separator.".format(
                    line, filename))
        res[key] = values.split(', ')
    return res
43
+
44
+
39
45
def _import_sound_classes(filename):
    """
    Import individually defined sound classes from a text file.

    The file is read with ``_read`` using NFC normalization. The resulting
    mapping of class -> members is inverted, so that the returned replacement
    dictionary maps every member to the sound class it is listed under.
    """
    classes = _read(filename, normalize='NFC')

    replacements = {}
    for sound_class in classes:
        for sound in classes[sound_class]:
            # The debug flag is looked up for every entry, exactly as before.
            if rcParams['debug']:
                print(sound, sound_class)
            replacements[sound] = sound_class

    return replacements
64
57
58
+
65
59
def _import_score_tree (filename ):
66
60
"""
67
61
Function imports score trees for a given range of sound classes and
68
62
converts them into a graph.
69
63
"""
70
- infile = codecs .open (filename ,'r' ,'utf-8' )
71
64
graph = nx .DiGraph ()
72
- data = []
73
- for line in infile :
74
- data .append (line .strip ().split (' : ' ))
75
- score_tree = {}
76
- for el1 ,el2 in data :
77
- score_tree [el1 ] = el2 .split (', ' )
78
- for key in score_tree .keys ():
79
- graph .add_node (key , val = score_tree [key ][0 ])
80
- for key in score_tree .keys ():
81
- for value in score_tree [key ][1 :]:
65
+ for key , values in _read (filename ).items ():
66
+ graph .add_node (key , val = values [0 ])
67
+ for value in values [1 :]:
82
68
if value != '-' :
83
69
node ,weight = value .split (':' )
84
70
graph .add_edge (key ,node ,weight = int (weight ))
@@ -383,16 +369,12 @@ def _export_score_dict(score_dict):
383
369
384
370
@todo: This function can be better ported to another file.
385
371
"""
386
-
387
372
letters = list (set ([key [0 ] for key in score_dict .keys ()]))
388
- outfile = codecs .open ('score_dict.csv' ,'w' ,'utf-8' )
389
- outfile .write ('\t ' + '\t ' .join (letters )+ '\n ' )
390
- for letter1 in letters :
391
- outfile .write (letter1 )
392
- for letter2 in letters :
393
- outfile .write ('\t ' + str (score_dict [(letter1 ,letter2 )]))
394
- outfile .write ('\n ' )
395
- outfile .close ()
373
+ rows = [['+' ] + letters ]
374
+ for l1 in letters :
375
+ rows .append ([l1 ] + [str (score_dict [(l1 , l2 )]) for l2 in letters ])
376
+ util .write_text_file ('score_dict.csv' , '\n ' .join ('\t ' .join (row ) for row in rows ))
377
+
396
378
397
379
def compile_model (
398
380
model ,
@@ -541,10 +523,8 @@ def compile_model(
541
523
scorer = misc .ScoreDict (chars ,matrix )
542
524
543
525
# create the matrix file
544
- f = codecs .open (os .path .join (new_path ,'matrix' ),'w' ,'utf-8' )
545
- f .write (scorer2str (scorer ))
546
- f .close ()
547
-
526
+ util .write_text_file (os .path .join (new_path ,'matrix' ), scorer2str (scorer ))
527
+
548
528
if scorer :
549
529
cache .dump (scorer , model + '.scorer' )
550
530
print ("... successfully created the scorer." )
@@ -577,7 +557,6 @@ def compile_dvt(path=''):
577
557
lingpy.data.model.Model
578
558
lingpy.data.derive.compile_model
579
559
"""
580
-
581
560
print ("[i] Compiling diacritics and vowels..." )
582
561
583
562
# get the path to the models
@@ -596,39 +575,24 @@ def compile_dvt(path=''):
596
575
'dvt_el'
597
576
)
598
577
else :
599
- pass
600
-
601
- diacritics = codecs .open (
602
- os .path .join (file_path ,'diacritics' ),
603
- 'r' ,
604
- 'utf-8'
605
- ).read ().replace ('\n ' ,'' ).replace ('-' ,'' )
606
- vowels = codecs .open (
607
- os .path .join (file_path ,'vowels' ),
608
- 'r' ,
609
- 'utf-8'
610
- ).read ().replace ('\n ' ,'' )
611
-
612
- tones = codecs .open (
613
- os .path .join (file_path ,'tones' ),
614
- 'r' ,
615
- 'utf-8'
616
- ).read ().replace ('\n ' ,'' )
617
-
618
- # normalize stuff
619
- # TODO: this is potentially dangerous and it is important to decide whether
620
- # TODO: switching to NFD might not be a better choice
621
- diacritics = unicodedata .normalize ("NFC" , diacritics )
622
- vowels = unicodedata .normalize ("NFC" , vowels )
623
- vowels = '' .join ([v for v in vowels if v not in diacritics ])
624
- tones = unicodedata .normalize ("NFC" , tones )
625
-
626
- dvt = (diacritics ,vowels ,tones )
578
+ file_path = path
579
+
580
+ def _read_string (name ):
581
+ # normalize stuff
582
+ # TODO: this is potentially dangerous and it is important to decide whether
583
+ # TODO: switching to NFD might not be a better choice
584
+ return util .read_text_file (
585
+ os .path .join (file_path , name ), normalize = 'NFC' ).replace ('\n ' , '' )
586
+
587
+ diacritics = _read_string ('diacritics' ).replace ('-' , '' )
588
+ vowels = '' .join ([v for v in _read_string ('vowels' ) if v not in diacritics ])
589
+ tones = _read_string ('tones' )
590
+
591
+ dvt = (diacritics , vowels , tones )
627
592
628
593
if path in ['evolaemp' , 'el' ]:
629
594
cache .dump (dvt , 'dvt_el' )
630
595
else :
631
596
cache .dump (dvt , 'dvt' )
632
597
633
598
if rcParams ['verbose' ]: print ("[i] Diacritics and sound classes were successfully compiled." )
634
-
0 commit comments