Skip to content

Commit f9a949d

Browse files
feat(api): new models for TTS, STT, + new audio features for Realtime (#1407)
1 parent b4faf0e commit f9a949d

File tree

15 files changed

+1247
-133
lines changed

15 files changed

+1247
-133
lines changed

.stats.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
configured_endpoints: 81
2-
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-b26121d5df6eb5d3032a45a267473798b15fcfec76dd44a3256cf1238be05fa4.yml
1+
configured_endpoints: 82
2+
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c22f59c66aec7914b6ee653d3098d1c1c8c16c180d2a158e819c8ddbf476f74b.yml

api.md

+18
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,11 @@ Types:
141141
Types:
142142

143143
- <code><a href="./src/resources/audio/transcriptions.ts">Transcription</a></code>
144+
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionInclude</a></code>
144145
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionSegment</a></code>
146+
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionStreamEvent</a></code>
147+
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionTextDeltaEvent</a></code>
148+
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionTextDoneEvent</a></code>
145149
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionVerbose</a></code>
146150
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionWord</a></code>
147151
- <code><a href="./src/resources/audio/transcriptions.ts">TranscriptionCreateResponse</a></code>
@@ -298,7 +302,9 @@ Types:
298302
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemDeleteEvent</a></code>
299303
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemDeletedEvent</a></code>
300304
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemInputAudioTranscriptionCompletedEvent</a></code>
305+
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemInputAudioTranscriptionDeltaEvent</a></code>
301306
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemInputAudioTranscriptionFailedEvent</a></code>
307+
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemRetrieveEvent</a></code>
302308
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemTruncateEvent</a></code>
303309
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemTruncatedEvent</a></code>
304310
- <code><a href="./src/resources/beta/realtime/realtime.ts">ConversationItemWithReference</a></code>
@@ -335,6 +341,8 @@ Types:
335341
- <code><a href="./src/resources/beta/realtime/realtime.ts">SessionCreatedEvent</a></code>
336342
- <code><a href="./src/resources/beta/realtime/realtime.ts">SessionUpdateEvent</a></code>
337343
- <code><a href="./src/resources/beta/realtime/realtime.ts">SessionUpdatedEvent</a></code>
344+
- <code><a href="./src/resources/beta/realtime/realtime.ts">TranscriptionSessionUpdate</a></code>
345+
- <code><a href="./src/resources/beta/realtime/realtime.ts">TranscriptionSessionUpdatedEvent</a></code>
338346

339347
### Sessions
340348

@@ -347,6 +355,16 @@ Methods:
347355

348356
- <code title="post /realtime/sessions">client.beta.realtime.sessions.<a href="./src/resources/beta/realtime/sessions.ts">create</a>({ ...params }) -> SessionCreateResponse</code>
349357

358+
### TranscriptionSessions
359+
360+
Types:
361+
362+
- <code><a href="./src/resources/beta/realtime/transcription-sessions.ts">TranscriptionSession</a></code>
363+
364+
Methods:
365+
366+
- <code title="post /realtime/transcription_sessions">client.beta.realtime.transcriptionSessions.<a href="./src/resources/beta/realtime/transcription-sessions.ts">create</a>({ ...params }) -> TranscriptionSession</code>
367+
350368
## Assistants
351369

352370
Types:

src/resources/audio/audio.ts

+15-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,14 @@ import * as TranscriptionsAPI from './transcriptions';
77
import {
88
Transcription,
99
TranscriptionCreateParams,
10+
TranscriptionCreateParamsNonStreaming,
11+
TranscriptionCreateParamsStreaming,
1012
TranscriptionCreateResponse,
13+
TranscriptionInclude,
1114
TranscriptionSegment,
15+
TranscriptionStreamEvent,
16+
TranscriptionTextDeltaEvent,
17+
TranscriptionTextDoneEvent,
1218
TranscriptionVerbose,
1319
TranscriptionWord,
1420
Transcriptions,
@@ -28,11 +34,12 @@ export class Audio extends APIResource {
2834
speech: SpeechAPI.Speech = new SpeechAPI.Speech(this._client);
2935
}
3036

31-
export type AudioModel = 'whisper-1';
37+
export type AudioModel = 'whisper-1' | 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe';
3238

3339
/**
3440
* The format of the output, in one of these options: `json`, `text`, `srt`,
35-
* `verbose_json`, or `vtt`.
41+
* `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
42+
* the only supported format is `json`.
3643
*/
3744
export type AudioResponseFormat = 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt';
3845

@@ -46,11 +53,17 @@ export declare namespace Audio {
4653
export {
4754
Transcriptions as Transcriptions,
4855
type Transcription as Transcription,
56+
type TranscriptionInclude as TranscriptionInclude,
4957
type TranscriptionSegment as TranscriptionSegment,
58+
type TranscriptionStreamEvent as TranscriptionStreamEvent,
59+
type TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent,
60+
type TranscriptionTextDoneEvent as TranscriptionTextDoneEvent,
5061
type TranscriptionVerbose as TranscriptionVerbose,
5162
type TranscriptionWord as TranscriptionWord,
5263
type TranscriptionCreateResponse as TranscriptionCreateResponse,
5364
type TranscriptionCreateParams as TranscriptionCreateParams,
65+
type TranscriptionCreateParamsNonStreaming as TranscriptionCreateParamsNonStreaming,
66+
type TranscriptionCreateParamsStreaming as TranscriptionCreateParamsStreaming,
5467
};
5568

5669
export {

src/resources/audio/index.ts

+6
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,17 @@ export { Speech, type SpeechModel, type SpeechCreateParams } from './speech';
55
export {
66
Transcriptions,
77
type Transcription,
8+
type TranscriptionInclude,
89
type TranscriptionSegment,
10+
type TranscriptionStreamEvent,
11+
type TranscriptionTextDeltaEvent,
12+
type TranscriptionTextDoneEvent,
913
type TranscriptionVerbose,
1014
type TranscriptionWord,
1115
type TranscriptionCreateResponse,
1216
type TranscriptionCreateParams,
17+
type TranscriptionCreateParamsNonStreaming,
18+
type TranscriptionCreateParamsStreaming,
1319
} from './transcriptions';
1420
export {
1521
Translations,

src/resources/audio/speech.ts

+8-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ export class Speech extends APIResource {
1818
}
1919
}
2020

21-
export type SpeechModel = 'tts-1' | 'tts-1-hd';
21+
export type SpeechModel = 'tts-1' | 'tts-1-hd' | 'gpt-4o-mini-tts';
2222

2323
export interface SpeechCreateParams {
2424
/**
@@ -28,7 +28,7 @@ export interface SpeechCreateParams {
2828

2929
/**
3030
* One of the available [TTS models](https://platform.openai.com/docs/models#tts):
31-
* `tts-1` or `tts-1-hd`
31+
* `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
3232
*/
3333
model: (string & {}) | SpeechModel;
3434

@@ -40,6 +40,12 @@ export interface SpeechCreateParams {
4040
*/
4141
voice: 'alloy' | 'ash' | 'coral' | 'echo' | 'fable' | 'onyx' | 'nova' | 'sage' | 'shimmer';
4242

43+
/**
44+
* Control the voice of your generated audio with additional instructions. Does not
45+
* work with `tts-1` or `tts-1-hd`.
46+
*/
47+
instructions?: string;
48+
4349
/**
4450
* The format to output audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`,
4551
* `wav`, and `pcm`.

0 commit comments

Comments
 (0)