 
 import { APIResource } from '../../../resource';
 import * as RealtimeAPI from './realtime';
+import * as Shared from '../../shared';
 import * as SessionsAPI from './sessions';
 import {
   Session as SessionsAPISession,
@@ -741,9 +742,38 @@ export interface RealtimeResponse {
   id?: string;
 
   /**
-   * Developer-provided string key-value pairs associated with this response.
+   * Which conversation the response is added to, determined by the `conversation`
+   * field in the `response.create` event. If `auto`, the response will be added to
+   * the default conversation and the value of `conversation_id` will be an id like
+   * `conv_1234`. If `none`, the response will not be added to any conversation and
+   * the value of `conversation_id` will be `null`. If responses are being triggered
+   * by server VAD, the response will be added to the default conversation, thus the
+   * `conversation_id` will be an id like `conv_1234`.
    */
-  metadata?: unknown | null;
+  conversation_id?: string;
+
+  /**
+   * Maximum number of output tokens for a single assistant response, inclusive of
+   * tool calls, that was used in this response.
+   */
+  max_output_tokens?: number | 'inf';
+
+  /**
+   * Set of 16 key-value pairs that can be attached to an object. This can be useful
+   * for storing additional information about the object in a structured format, and
+   * querying for objects via API or the dashboard.
+   *
+   * Keys are strings with a maximum length of 64 characters. Values are strings with
+   * a maximum length of 512 characters.
+   */
+  metadata?: Shared.Metadata | null;
+
+  /**
+   * The set of modalities the model used to respond. If there are multiple
+   * modalities, the model will pick one, for example if `modalities` is
+   * `["text", "audio"]`, the model could be responding in either text or audio.
+   */
+  modalities?: Array<'text' | 'audio'>;
 
   /**
    * The object type, must be `realtime.response`.
@@ -755,6 +785,11 @@ export interface RealtimeResponse {
    */
   output?: Array<ConversationItem>;
 
+  /**
+   * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
   /**
    * The final status of the response (`completed`, `cancelled`, `failed`, or
    * `incomplete`).
@@ -766,13 +801,24 @@ export interface RealtimeResponse {
    */
   status_details?: RealtimeResponseStatus;
 
+  /**
+   * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+   */
+  temperature?: number;
+
   /**
    * Usage statistics for the Response, this will correspond to billing. A Realtime
    * API session will maintain a conversation context and append new Items to the
    * Conversation, thus output from previous turns (text and audio tokens) will
    * become the input for later turns.
    */
   usage?: RealtimeResponseUsage;
+
+  /**
+   * The voice the model used to respond. Current voice options are `alloy`, `ash`,
+   * `ballad`, `coral`, `echo` `sage`, `shimmer` and `verse`.
+   */
+  voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
 }
 
 /**
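Taken together, the hunks above add several read-only fields to `RealtimeResponse`. A minimal sketch of how a client might inspect them on a finished response; the import path and the idea of running this in a `response.done` handler are assumptions, not part of this diff:

// Sketch: reading the new RealtimeResponse fields after a response completes.
// Assumed: the import path below and that this is called from a `response.done` handler.
import type { RealtimeResponse } from 'openai/resources/beta/realtime/realtime';

function logResponseSummary(response: RealtimeResponse): void {
  // `conversation_id` is an id like `conv_1234`, or null/absent when the response
  // was created with `conversation: "none"`.
  console.log('conversation:', response.conversation_id ?? '(none)');

  // `max_output_tokens` is either a number or the literal string 'inf'.
  const tokenCap = response.max_output_tokens === 'inf' ? Infinity : response.max_output_tokens;
  console.log('token cap:', tokenCap, 'temperature:', response.temperature);

  // The response echoes back the modalities, voice, and audio format it used.
  console.log('modalities:', response.modalities, 'voice:', response.voice);
  console.log('audio format:', response.output_audio_format);

  // `metadata` is now `Shared.Metadata | null` (string keys and values) instead of `unknown`.
  for (const [key, value] of Object.entries(response.metadata ?? {})) {
    console.log(`metadata ${key}=${value}`);
  }
}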
@@ -1320,11 +1366,13 @@ export namespace ResponseCreateEvent {
 
     /**
      * Set of 16 key-value pairs that can be attached to an object. This can be useful
-     * for storing additional information about the object in a structured format. Keys
-     * can be a maximum of 64 characters long and values can be a maximum of 512
-     * characters long.
+     * for storing additional information about the object in a structured format, and
+     * querying for objects via API or the dashboard.
+     *
+     * Keys are strings with a maximum length of 64 characters. Values are strings with
+     * a maximum length of 512 characters.
      */
-    metadata?: unknown | null;
+    metadata?: Shared.Metadata | null;
 
     /**
      * The set of modalities the model can respond with. To disable audio, set this to
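With `metadata` now typed as `Shared.Metadata` on `ResponseCreateEvent.Response` as well, a `response.create` payload is restricted to string keys and string values. A rough sketch under that assumption; the event envelope, the field values, and the WebSocket send are illustrative, not from this diff:

// Sketch: building a `response.create` event with the newly typed metadata field.
const responseCreate = {
  type: 'response.create',
  response: {
    modalities: ['text', 'audio'],
    // Shared.Metadata: up to 16 pairs, keys <= 64 chars, values <= 512 chars.
    metadata: {
      customer_id: 'cus_123',   // illustrative values
      topic: 'order-status',
    },
  },
} as const;

// ws.send(JSON.stringify(responseCreate));  // hypothetical realtime WebSocket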
@@ -1716,8 +1764,11 @@ export namespace SessionUpdateEvent {
      * Configuration for input audio transcription, defaults to off and can be set to
      * `null` to turn off once on. Input audio transcription is not native to the
      * model, since the model consumes audio directly. Transcription runs
-     * asynchronously through Whisper and should be treated as rough guidance rather
-     * than the representation understood by the model.
+     * asynchronously through
+     * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+     * and should be treated as rough guidance rather than the representation
+     * understood by the model. The client can optionally set the language and prompt
+     * for transcription, these fields will be passed to the Whisper API.
      */
     input_audio_transcription?: Session.InputAudioTranscription;
 
@@ -1801,15 +1852,33 @@ export namespace SessionUpdateEvent {
      * Configuration for input audio transcription, defaults to off and can be set to
      * `null` to turn off once on. Input audio transcription is not native to the
      * model, since the model consumes audio directly. Transcription runs
-     * asynchronously through Whisper and should be treated as rough guidance rather
-     * than the representation understood by the model.
+     * asynchronously through
+     * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
+     * and should be treated as rough guidance rather than the representation
+     * understood by the model. The client can optionally set the language and prompt
+     * for transcription, these fields will be passed to the Whisper API.
      */
     export interface InputAudioTranscription {
+      /**
+       * The language of the input audio. Supplying the input language in
+       * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+       * format will improve accuracy and latency.
+       */
+      language?: string;
+
       /**
        * The model to use for transcription, `whisper-1` is the only currently supported
        * model.
        */
       model?: string;
+
+      /**
+       * An optional text to guide the model's style or continue a previous audio
+       * segment. The
+       * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+       * should match the audio language.
+       */
+      prompt?: string;
     }
 
     export interface Tool {
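The new `language` and `prompt` fields let a client steer Whisper transcription when updating the session. A minimal sketch, assuming the usual `session.update` event envelope and a WebSocket transport (neither is shown in this diff):

// Sketch: enabling input audio transcription with the new language/prompt fields.
const sessionUpdate = {
  type: 'session.update',
  session: {
    input_audio_transcription: {
      model: 'whisper-1',
      language: 'en',                                  // ISO-639-1 code improves accuracy and latency
      prompt: 'Glossary: Realtime API, VAD, PCM16.',   // should match the audio language
    },
  },
} as const;

// ws.send(JSON.stringify(sessionUpdate));  // hypothetical realtime WebSocket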