|
| 1 | +#!/usr/bin/env python |
| 2 | +from __future__ import absolute_import, division, print_function |
| 3 | + |
| 4 | +# Make sure we can import stuff from util/ |
| 5 | +# This script needs to be run from the root of the DeepSpeech repository |
| 6 | +import os |
| 7 | +import sys |
| 8 | +sys.path.insert(1, os.path.join(sys.path[0], '..')) |
| 9 | + |
| 10 | +import csv |
| 11 | +import subprocess |
| 12 | +import progressbar |
| 13 | + |
| 14 | +from os import path |
| 15 | +from sox import Transformer |
| 16 | +from threading import RLock |
| 17 | +from multiprocessing.dummy import Pool |
| 18 | +from multiprocessing import cpu_count |
| 19 | +from util.downloader import SIMPLE_BAR |
| 20 | + |
| 21 | +''' |
| 22 | +Broadly speaking, this script takes the audio downloaded from Common Voice |
| 23 | +for a certain language, in addition to the *.tsv files output by CorporaCreator, |
| 24 | +and the script formats the data and transcripts to be in a state usable by |
| 25 | +DeepSpeech.py |
| 26 | + |
| 27 | +Usage: |
| 28 | + $ python3 import_cv2.py /path/to/audio/data_dir /path/to/tsv_dir |
| 29 | + |
| 30 | +Input: |
| 31 | + (1) audio_dir (string) path to dir of audio downloaded from Common Voice |
| 32 | + (2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files |
| 33 | + which were generated by CorporaCreator |
| 34 | + |
| 35 | +Output: |
| 36 | + (1) csv files in format needed by DeepSpeech.py, saved into audio_dir |
| 37 | + (2) wav files, saved into audio_dir alongside their mp3s |
| 38 | +''' |
| 39 | + |
| 40 | +FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript'] |
| 41 | +SAMPLE_RATE = 16000 |
| 42 | +MAX_SECS = 10 |
| 43 | + |
def _preprocess_data(audio_dir, tsv_dir):
    """Convert every Common Voice split whose TSV file is present in ``tsv_dir``.

    The splits ``train``, ``test`` and ``dev`` are looked up, in that order, as
    ``<split>.tsv`` under the absolute form of ``tsv_dir``. Each file found is
    handed to ``_maybe_convert_set`` together with ``audio_dir``; a missing
    file is reported on stdout and the loop moves on to the next split.
    """
    tsv_root = path.abspath(tsv_dir)
    for split_name in ('train', 'test', 'dev'):
        input_tsv = path.join(tsv_root, split_name + ".tsv")
        if not path.isfile(input_tsv):
            print("ERROR: no TSV file found: ", input_tsv)
            continue
        print("Loading TSV file: ", input_tsv)
        _maybe_convert_set(audio_dir, input_tsv)
| 52 | + |
def _maybe_convert_set(audio_dir, input_tsv):
    """Build the DeepSpeech CSV for one Common Voice TSV split.

    Reads the ``path`` and ``sentence`` columns of ``input_tsv``, makes sure a
    WAV file exists next to each referenced audio file under ``audio_dir``,
    leaves out clips that are too short for their transcript or longer than
    ``MAX_SECS`` seconds, and writes the remaining ones to a CSV in
    ``audio_dir``. The CSV name is the TSV's base name with ``tsv`` replaced
    by ``csv``. A summary of imported and skipped samples is printed at the end.
    """
    csv_name = path.basename(input_tsv).replace('tsv', 'csv')
    output_csv = path.join(audio_dir, csv_name)
    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)

    # Collect (relative audio path, transcript) pairs from the TSV
    with open(input_tsv) as tsv_handle:
        samples = [(entry['path'], entry['sentence'])
                   for entry in csv.DictReader(tsv_handle, delimiter='\t')]

    # Shared bookkeeping, updated by the worker threads under `lock`
    counter = {'all': 0, 'too_short': 0, 'too_long': 0}
    lock = RLock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        """Ensure the clip has a WAV version, then keep it or count it as skipped."""
        relative_audio, transcript = sample
        mp3_filename = path.join(audio_dir, relative_audio)
        # The WAV lives beside the source file; only the extension differs
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        soxi_output = subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT)
        frames = int(soxi_output)
        file_size = path.getsize(wav_filename)

        # Classify before taking the lock; only the shared updates need it
        seconds = frames / SAMPLE_RATE
        too_short = int(seconds * 1000 / 10 / 2) < len(str(transcript))
        too_long = seconds > MAX_SECS
        with lock:
            if too_short:
                # Not enough audio to fit the transcript
                counter['too_short'] += 1
            elif too_long:
                # Very long clips are left out to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # Usable sample - it goes into the target CSV
                rows.append((wav_filename, file_size, transcript))
            counter['all'] += 1

    print("Importing mp3 files...")
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    finished = 0
    for _ in pool.imap_unordered(one_sample, samples):
        finished += 1
        bar.update(finished)
    bar.update(num_samples)
    pool.close()
    pool.join()

    with open(output_csv, 'w') as csv_handle:
        print('Writing CSV file for DeepSpeech.py as: ', output_csv)
        writer = csv.DictWriter(csv_handle, fieldnames=FIELDNAMES)
        writer.writeheader()
        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
        for wav_path, wav_size, text in bar(rows):
            writer.writerow({'wav_filename': wav_path, 'wav_filesize': wav_size, 'transcript': text})

    skipped_short = counter['too_short']
    skipped_long = counter['too_long']
    print('Imported %d samples.' % (counter['all'] - skipped_short - skipped_long))
    if skipped_short > 0:
        print('Skipped %d samples that were too short to match the transcript.' % skipped_short)
    if skipped_long > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (skipped_long, MAX_SECS))
| 112 | + |
def _maybe_convert_wav(mp3_filename, wav_filename):
    """Create ``wav_filename`` from ``mp3_filename`` unless it already exists.

    The conversion goes through a sox ``Transformer`` configured with
    ``SAMPLE_RATE`` as the output sample rate. Nothing is done when
    ``wav_filename`` is already on disk.
    """
    if path.exists(wav_filename):
        return
    resampler = Transformer()
    resampler.convert(samplerate=SAMPLE_RATE)
    resampler.build(mp3_filename, wav_filename)
| 118 | + |
if __name__ == "__main__":
    # Both positional arguments are required. Without this check a missing one
    # surfaces as a bare IndexError traceback instead of an actionable message.
    # Extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s /path/to/audio/data_dir /path/to/tsv_dir' % sys.argv[0])
    audio_dir = sys.argv[1]
    tsv_dir = sys.argv[2]
    print('Expecting your audio from Common Voice to be in: ', audio_dir)
    print('Looking for *.tsv files (generated by CorporaCreator) in: ', tsv_dir)
    _preprocess_data(audio_dir, tsv_dir)
0 commit comments