Skip to content

Commit 7bee1cf

Browse files
author
josh
committed
train/test/dev checking now made explicit
1 parent 9810946 commit 7bee1cf

File tree

1 file changed

+8
-11
lines changed

1 file changed

+8
-11
lines changed

bin/import_cv2.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,13 @@
1111
import subprocess
1212
import progressbar
1313

14-
from glob import glob
1514
from os import path
1615
from sox import Transformer
1716
from threading import RLock
1817
from multiprocessing.dummy import Pool
1918
from multiprocessing import cpu_count
2019
from util.downloader import SIMPLE_BAR
2120

22-
2321
'''
2422
Broadly speaking, this script takes the audio downloaded from Common Voice
2523
for a certain language, in addition to the *.tsv files output by CorporaCreator,
@@ -31,7 +29,8 @@
3129
3230
Input:
3331
(1) audio_dir (string) path to dir of audio downloaded from Common Voice
34-
(2) tsv_dir (string) path to dir containing tsv files generated by CorporaCreator
32+
(2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files
33+
which were generated by CorporaCreator
3534
3635
Output:
3736
(1) csv files in format needed by DeepSpeech.py, saved into audio_dir
@@ -43,15 +42,13 @@
4342
MAX_SECS = 10
4443

4544
def _preprocess_data(audio_dir, tsv_dir):
46-
try:
47-
# Check if there is at least one TSV file in tsv_dir
48-
os.path.isfile(glob(path.join(path.abspath(tsv_dir), '*.tsv'))[0])
49-
for input_tsv in glob(path.join(path.abspath(tsv_dir), '*.tsv')):
50-
print("Loading in TSV file: ", input_tsv)
45+
for dataset in ['train','test','dev']:
46+
input_tsv= path.join(path.abspath(tsv_dir), dataset+".tsv")
47+
if os.path.isfile(input_tsv):
48+
print("Loading TSV file: ", input_tsv)
5149
_maybe_convert_set(audio_dir, input_tsv)
52-
except IndexError:
53-
print("ERROR: no TSV file found in: ", tsv_dir)
54-
50+
else:
51+
print("ERROR: no TSV file found: ", input_tsv)
5552

5653
def _maybe_convert_set(audio_dir, input_tsv):
5754
output_csv = path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))

0 commit comments

Comments
 (0)