|
11 | 11 | import subprocess
|
12 | 12 | import progressbar
|
13 | 13 |
|
14 |
| -from glob import glob |
15 | 14 | from os import path
|
16 | 15 | from sox import Transformer
|
17 | 16 | from threading import RLock
|
18 | 17 | from multiprocessing.dummy import Pool
|
19 | 18 | from multiprocessing import cpu_count
|
20 | 19 | from util.downloader import SIMPLE_BAR
|
21 | 20 |
|
22 |
| - |
23 | 21 | '''
|
24 | 22 | Broadly speaking, this script takes the audio downloaded from Common Voice
|
25 | 23 | for a certain language, in addition to the *.tsv files output by CorporaCreator,
|
|
31 | 29 |
|
32 | 30 | Input:
|
33 | 31 | (1) audio_dir (string) path to dir of audio downloaded from Common Voice
|
34 |
| - (2) tsv_dir (string) path to dir containing tsv files generated by CorporaCreator |
| 32 | + (2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files |
| 33 | + which were generated by CorporaCreator |
35 | 34 |
|
36 | 35 | Output:
|
37 | 36 | (1) csv files in format needed by DeepSpeech.py, saved into audio_dir
|
|
43 | 42 | MAX_SECS = 10
|
44 | 43 |
|
45 | 44 | def _preprocess_data(audio_dir, tsv_dir):
|
46 |
| - try: |
47 |
| - # Check if there is at least one TSV file in tsv_dir |
48 |
| - os.path.isfile(glob(path.join(path.abspath(tsv_dir), '*.tsv'))[0]) |
49 |
| - for input_tsv in glob(path.join(path.abspath(tsv_dir), '*.tsv')): |
50 |
| - print("Loading in TSV file: ", input_tsv) |
| 45 | + for dataset in ['train','test','dev']: |
| 46 | + input_tsv= path.join(path.abspath(tsv_dir), dataset+".tsv") |
| 47 | + if os.path.isfile(input_tsv): |
| 48 | + print("Loading TSV file: ", input_tsv) |
51 | 49 | _maybe_convert_set(audio_dir, input_tsv)
|
52 |
| - except IndexError: |
53 |
| - print("ERROR: no TSV file found in: ", tsv_dir) |
54 |
| - |
| 50 | + else: |
| 51 | + print("ERROR: no TSV file found: ", input_tsv) |
55 | 52 |
|
56 | 53 | def _maybe_convert_set(audio_dir, input_tsv):
|
57 | 54 | output_csv = path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
|
|
0 commit comments