Merge pull request #1860 from mozilla/import_cv

JRMeyer · web-flow · commit 3378008f5d23 · 2019-02-01T16:27:10.000+01:00
Import cv
diff --git a/bin/import_cv2.py b/bin/import_cv2.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+from __future__ import absolute_import, division, print_function
+
+# Make sure we can import stuff from util/
+# This script needs to be run from the root of the DeepSpeech repository
+import os
+import sys
+sys.path.insert(1, os.path.join(sys.path[0], '..'))
+
+import csv
+import subprocess
+import progressbar
+
+from os import path
+from sox import Transformer
+from threading import RLock
+from multiprocessing.dummy import Pool
+from multiprocessing import cpu_count
+from util.downloader import SIMPLE_BAR
+
+'''
+Broadly speaking, this script takes the audio downloaded from Common Voice
+for a certain language, in addition to the *.tsv files output by CorporaCreator,
+and the script formats the data and transcripts to be in a state usable by
+DeepSpeech.py
+
+Usage:
+        $ python3 import_cv2.py /path/to/audio/data_dir /path/to/tsv_dir
+
+Input: 
+        (1) audio_dir (string) path to dir of audio downloaded from Common Voice
+        (2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files 
+            which were generated by CorporaCreator
+
+Output:
+        (1) csv files in format needed by DeepSpeech.py, saved into audio_dir
+        (2) wav files, saved into audio_dir alongside their mp3s
+'''
+
+# Column names (and order) of the CSV files written for DeepSpeech.py
+FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
+# Sample rate requested from sox when converting mp3 to wav; also used to
+# turn the frame count reported by soxi into a duration in seconds
+SAMPLE_RATE = 16000
+# Samples whose duration exceeds this many seconds are left out of the CSV
+MAX_SECS = 10
+
+def _preprocess_data(audio_dir, tsv_dir):
+    for dataset in ['train','test','dev']:
+        input_tsv= path.join(path.abspath(tsv_dir), dataset+".tsv")
+        if os.path.isfile(input_tsv):
+            print("Loading TSV file: ", input_tsv)
+            _maybe_convert_set(audio_dir, input_tsv)
+        else:
+            print("ERROR: no TSV file found: ", input_tsv)
+
+def _maybe_convert_set(audio_dir, input_tsv):
+    output_csv =  path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
+    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
+    
+    # Get audiofile path and transcript for each sentence in tsv
+    samples = []
+    with open(input_tsv) as input_tsv_file:
+        reader = csv.DictReader(input_tsv_file, delimiter='\t')
+        for row in reader:
+            samples.append((row['path'], row['sentence']))
+            
+    # Keep track of how many samples are good vs. problematic
+    counter = { 'all': 0, 'too_short': 0, 'too_long': 0 }
+    lock = RLock()
+    num_samples = len(samples)
+    rows = []
+    
+    def one_sample(sample):
+        """ Take a audio file, and optionally convert it to 16kHz WAV """
+        mp3_filename = path.join(audio_dir, sample[0])
+        # Storing wav files next to the mp3 ones - just with a different suffix
+        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
+        _maybe_convert_wav(mp3_filename, wav_filename)
+        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
+        file_size = path.getsize(wav_filename)
+        with lock:
+            if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])):
+                # Excluding samples that are too short to fit the transcript
+                counter['too_short'] += 1
+            elif frames/SAMPLE_RATE > MAX_SECS:
+                # Excluding very long samples to keep a reasonable batch-size
+                counter['too_long'] += 1
+            else:
+                # This one is good - keep it for the target CSV
+                rows.append((wav_filename, file_size, sample[1]))
+            counter['all'] += 1
+            
+    print("Importing mp3 files...")
+    pool = Pool(cpu_count())
+    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
+    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
+        bar.update(i)
+    bar.update(num_samples)
+    pool.close()
+    pool.join()
+    
+    with open(output_csv, 'w') as output_csv_file:
+        print('Writing CSV file for DeepSpeech.py as: ', output_csv)
+        writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
+        writer.writeheader()
+        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
+        for filename, file_size, transcript in bar(rows):
+            writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })
+            
+    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
+    if counter['too_short'] > 0:
+        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
+    if counter['too_long'] > 0:
+        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
+
+def _maybe_convert_wav(mp3_filename, wav_filename):
+    if not path.exists(wav_filename):
+        transformer = Transformer()
+        transformer.convert(samplerate=SAMPLE_RATE)
+        transformer.build(mp3_filename, wav_filename)
+
+if __name__ == "__main__":
+    audio_dir = sys.argv[1]
+    tsv_dir = sys.argv[2]
+    print('Expecting your audio from Common Voice to be in: ', audio_dir)
+    print('Looking for *.tsv files (generated by CorporaCreator) in: ', tsv_dir)
+    _preprocess_data(audio_dir, tsv_dir)