Skip to content

Commit 3378008

Browse files
authored
Merge pull request #1860 from mozilla/import_cv
Import cv
2 parents e3bda9e + 7bee1cf commit 3378008

File tree

1 file changed

+124
-0
lines changed

1 file changed

+124
-0
lines changed

bin/import_cv2.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
#!/usr/bin/env python
2+
from __future__ import absolute_import, division, print_function
3+
4+
# Make sure we can import stuff from util/
5+
# This script needs to be run from the root of the DeepSpeech repository
6+
import os
7+
import sys
8+
sys.path.insert(1, os.path.join(sys.path[0], '..'))
9+
10+
import csv
11+
import subprocess
12+
import progressbar
13+
14+
from os import path
15+
from sox import Transformer
16+
from threading import RLock
17+
from multiprocessing.dummy import Pool
18+
from multiprocessing import cpu_count
19+
from util.downloader import SIMPLE_BAR
20+
21+
'''
22+
Broadly speaking, this script takes the audio downloaded from Common Voice
23+
for a certain language, in addition to the *.tsv files output by CorporaCreator,
24+
and the script formats the data and transcripts to be in a state usable by
25+
DeepSpeech.py
26+
27+
Usage:
28+
$ python3 import_cv2.py /path/to/audio/data_dir /path/to/tsv_dir
29+
30+
Input:
31+
(1) audio_dir (string) path to dir of audio downloaded from Common Voice
32+
(2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files
33+
which were generated by CorporaCreator
34+
35+
Output:
36+
(1) csv files in format needed by DeepSpeech.py, saved into audio_dir
37+
(2) wav files, saved into audio_dir alongside their mp3s
38+
'''
39+
40+
# Column names, in order, of the CSV file written for DeepSpeech.py
FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
41+
# Sample rate (Hz) the WAV files are converted to; also used to turn frame counts into seconds
SAMPLE_RATE = 16000
42+
# Samples longer than this many seconds are left out of the output CSV
MAX_SECS = 10
43+
44+
def _preprocess_data(audio_dir, tsv_dir):
45+
for dataset in ['train','test','dev']:
46+
input_tsv= path.join(path.abspath(tsv_dir), dataset+".tsv")
47+
if os.path.isfile(input_tsv):
48+
print("Loading TSV file: ", input_tsv)
49+
_maybe_convert_set(audio_dir, input_tsv)
50+
else:
51+
print("ERROR: no TSV file found: ", input_tsv)
52+
53+
def _maybe_convert_set(audio_dir, input_tsv):
    """Convert one Common Voice TSV file into a DeepSpeech-formatted CSV.

    Reads the ``path`` and ``sentence`` columns of ``input_tsv``, makes sure
    every referenced mp3 under ``audio_dir`` has a WAV file next to it, drops
    the samples that are too short for their transcript or longer than
    ``MAX_SECS`` seconds, and writes the remaining ones to a CSV file in
    ``audio_dir`` named after the TSV file (``train.tsv`` -> ``train.csv``).

    Args:
        audio_dir: directory containing the audio clips; the CSV file is
            written into this directory as well.
        input_tsv: path of the TSV file to import.
    """
    # Swap only the extension: str.replace('tsv', 'csv') would also rewrite
    # any other "tsv" that happens to occur in the file name.
    csv_name = path.splitext(path.basename(input_tsv))[0] + '.csv'
    output_csv = path.join(audio_dir, csv_name)
    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)

    # Get audiofile path and transcript for each sentence in tsv
    with open(input_tsv) as input_tsv_file:
        reader = csv.DictReader(input_tsv_file, delimiter='\t')
        samples = [(row['path'], row['sentence']) for row in reader]

    # Keep track of how many samples are good vs. problematic
    counter = {'all': 0, 'too_short': 0, 'too_long': 0}
    # Guards counter and rows, which are updated from the pool's worker threads
    lock = RLock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        """ Take an audio file, and optionally convert it to 16kHz WAV """
        mp3_filename = path.join(audio_dir, sample[0])
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        # `soxi -s` prints the number of samples (frames) in the file
        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        file_size = path.getsize(wav_filename)
        with lock:
            # frames/SAMPLE_RATE*1000 is the duration in ms, so this demands
            # at least 20 ms of audio per transcript character
            if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames/SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            counter['all'] += 1

    print("Importing mp3 files...")
    pool = Pool(cpu_count())
    try:
        bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
        for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
            bar.update(i)
        bar.update(num_samples)
    except BaseException:
        # A sample failed (or the user interrupted): drop the queued work
        # instead of converting the rest of the set before reporting it.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Never leave worker threads behind, whatever happened above
        pool.join()

    with open(output_csv, 'w') as output_csv_file:
        print('Writing CSV file for DeepSpeech.py as: ', output_csv)
        writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
        for filename, file_size, transcript in bar(rows):
            writer.writerow({'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript})

    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
112+
113+
def _maybe_convert_wav(mp3_filename, wav_filename):
114+
if not path.exists(wav_filename):
115+
transformer = Transformer()
116+
transformer.convert(samplerate=SAMPLE_RATE)
117+
transformer.build(mp3_filename, wav_filename)
118+
119+
if __name__ == "__main__":
    # Both positional arguments are required; exit with a usage hint instead
    # of an IndexError traceback when one is missing. Extra arguments are
    # ignored, as before.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s /path/to/audio/data_dir /path/to/tsv_dir' % sys.argv[0])
    audio_dir = sys.argv[1]
    tsv_dir = sys.argv[2]
    print('Expecting your audio from Common Voice to be in: ', audio_dir)
    print('Looking for *.tsv files (generated by CorporaCreator) in: ', tsv_dir)
    _preprocess_data(audio_dir, tsv_dir)

0 commit comments

Comments
 (0)