# This file was auto-generated by Fern from our API Definition.

import json
import typing
from json.decoder import JSONDecodeError

from .. import core
from ..core.api_error import ApiError
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
from ..core.http_response import AsyncHttpResponse, HttpResponse
from ..core.jsonable_encoder import jsonable_encoder
from ..core.request_options import RequestOptions
from ..core.unchecked_base_model import construct_type
from ..errors.unprocessable_entity_error import UnprocessableEntityError
from ..types.additional_formats import AdditionalFormats
from ..types.http_validation_error import HttpValidationError
from .types.speech_to_text_convert_request_file_format import SpeechToTextConvertRequestFileFormat
from .types.speech_to_text_convert_request_timestamps_granularity import SpeechToTextConvertRequestTimestampsGranularity
from .types.speech_to_text_convert_request_webhook_metadata import SpeechToTextConvertRequestWebhookMetadata
from .types.speech_to_text_convert_response import SpeechToTextConvertResponse

# this is used as the default value for optional parameters
OMIT = typing.cast(typing.Any, ...)
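# Passing OMIT (the Ellipsis sentinel above) lets the request layer distinguish
# "argument not supplied" from an explicit None: `omit=OMIT` is forwarded to
# httpx_client.request below so that omitted values can be stripped from the
# outgoing payload.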


class RawSpeechToTextClient:
    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._client_wrapper = client_wrapper

    def convert(
        self,
        *,
        model_id: str,
        enable_logging: typing.Optional[bool] = None,
        file: typing.Optional[core.File] = OMIT,
        language_code: typing.Optional[str] = OMIT,
        tag_audio_events: typing.Optional[bool] = OMIT,
        num_speakers: typing.Optional[int] = OMIT,
        timestamps_granularity: typing.Optional[SpeechToTextConvertRequestTimestampsGranularity] = OMIT,
        diarize: typing.Optional[bool] = OMIT,
        diarization_threshold: typing.Optional[float] = OMIT,
        additional_formats: typing.Optional[AdditionalFormats] = OMIT,
        file_format: typing.Optional[SpeechToTextConvertRequestFileFormat] = OMIT,
        cloud_storage_url: typing.Optional[str] = OMIT,
        webhook: typing.Optional[bool] = OMIT,
        webhook_id: typing.Optional[str] = OMIT,
        temperature: typing.Optional[float] = OMIT,
        seed: typing.Optional[int] = OMIT,
        use_multi_channel: typing.Optional[bool] = OMIT,
        webhook_metadata: typing.Optional[SpeechToTextConvertRequestWebhookMetadata] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> HttpResponse[SpeechToTextConvertResponse]:
        """
        Transcribe an audio or video file. If `webhook` is set to true, the request is processed asynchronously and the results are sent to configured webhooks. When `use_multi_channel` is true and the provided audio has multiple channels, a 'transcripts' object with a separate transcript for each channel is returned; otherwise, a single transcript is returned. The optional `webhook_metadata` parameter lets you attach custom data that will be included in webhook responses for request correlation and tracking.

        Parameters
        ----------
        model_id : str
            The ID of the model to use for transcription. Currently only 'scribe_v1' and 'scribe_v1_experimental' are available.

        enable_logging : typing.Optional[bool]
            When enable_logging is set to false, zero-retention mode is used for the request, meaning log and transcript storage features are unavailable for it. Zero-retention mode may only be used by enterprise customers.

        file : typing.Optional[core.File]
            See core.File for more documentation.

        language_code : typing.Optional[str]
            An ISO-639-1 or ISO-639-3 language_code corresponding to the language of the audio file. Can sometimes improve transcription performance if known beforehand. Defaults to null, in which case the language is predicted automatically.

        tag_audio_events : typing.Optional[bool]
            Whether to tag audio events like (laughter), (footsteps), etc. in the transcription.

        num_speakers : typing.Optional[int]
            The maximum number of speakers talking in the uploaded file. Can help with predicting who speaks when. At most 32 speakers can be predicted. Defaults to null, in which case the number of speakers is set to the maximum value the model supports.

        timestamps_granularity : typing.Optional[SpeechToTextConvertRequestTimestampsGranularity]
            The granularity of the timestamps in the transcription. 'word' provides word-level timestamps and 'character' provides character-level timestamps per word.

        diarize : typing.Optional[bool]
            Whether to annotate which speaker is currently talking in the uploaded file.

        diarization_threshold : typing.Optional[float]
            Diarization threshold to apply during speaker diarization. A higher value lowers the chance of one speaker being diarized as two different speakers, but raises the chance of two different speakers being diarized as one speaker (fewer total speakers predicted); a lower value does the opposite (more total speakers predicted). Can only be set when diarize=True and num_speakers=None. Defaults to None, in which case a threshold is chosen based on the model_id (usually 0.22).

        additional_formats : typing.Optional[AdditionalFormats]
            A list of additional formats to export the transcript to.

        file_format : typing.Optional[SpeechToTextConvertRequestFileFormat]
            The format of input audio. Options are 'pcm_s16le_16' or 'other'. For `pcm_s16le_16`, the input audio must be 16-bit PCM at a 16kHz sample rate, single channel (mono), and little-endian byte order. Latency will be lower than when passing an encoded waveform.

        cloud_storage_url : typing.Optional[str]
            The HTTPS URL of the file to transcribe. Exactly one of the file or cloud_storage_url parameters must be provided. The file must be accessible via HTTPS and the file size must be less than 2GB. Any valid HTTPS URL is accepted, including URLs from cloud storage providers (AWS S3, Google Cloud Storage, Cloudflare R2, etc.), CDNs, or any other HTTPS source. URLs can be pre-signed or include authentication tokens in query parameters.

        webhook : typing.Optional[bool]
            Whether to send the transcription result to configured speech-to-text webhooks. If set, the request returns early without the transcription, which is delivered later via webhook.

        webhook_id : typing.Optional[str]
            Optional specific webhook ID to send the transcription result to. Only valid when webhook is set to true. If not provided, transcription will be sent to all configured speech-to-text webhooks.

        temperature : typing.Optional[float]
            Controls the randomness of the transcription output. Accepts values between 0.0 and 2.0, where higher values result in more diverse and less deterministic results. If omitted, a temperature based on the selected model is used, which is usually 0.

        seed : typing.Optional[int]
            If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed. Must be an integer between 0 and 2147483647.

        use_multi_channel : typing.Optional[bool]
            Whether the audio file contains multiple channels where each channel contains a single speaker. When enabled, each channel will be transcribed independently and the results will be combined. Each word in the response will include a 'channel_index' field indicating which channel it was spoken on. A maximum of 5 channels is supported.

        webhook_metadata : typing.Optional[SpeechToTextConvertRequestWebhookMetadata]
            Optional metadata to be included in the webhook response. This should be a JSON string representing an object with a maximum depth of 2 levels and maximum size of 16KB. Useful for tracking internal IDs, job references, or other contextual information.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        HttpResponse[SpeechToTextConvertResponse]
            Synchronous transcription result
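
        Examples
        --------
        A minimal sketch of a blocking call. `raw_client` is assumed to be an
        already-initialized RawSpeechToTextClient (the client wrapper is normally
        constructed by the SDK's root client), and `audio.mp3` is an assumed
        local file::

            with open("audio.mp3", "rb") as audio:
                result = raw_client.convert(model_id="scribe_v1", file=audio)
            print(result.data)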
        """
        _response = self._client_wrapper.httpx_client.request(
            "v1/speech-to-text",
            method="POST",
            params={
                "enable_logging": enable_logging,
            },
            data={
                "model_id": model_id,
                "language_code": language_code,
                "tag_audio_events": tag_audio_events,
                "num_speakers": num_speakers,
                "timestamps_granularity": timestamps_granularity,
                "diarize": diarize,
                "diarization_threshold": diarization_threshold,
                "file_format": file_format,
                "cloud_storage_url": cloud_storage_url,
                "webhook": webhook,
                "webhook_id": webhook_id,
                "temperature": temperature,
                "seed": seed,
                "use_multi_channel": use_multi_channel,
                "webhook_metadata": webhook_metadata,
            },
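            # The raw upload and the JSON-encoded additional_formats spec travel as
            # multipart parts below; the fields in `data` above are sent as form
            # fields of the same multipart body (force_multipart=True).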
            files={
                **({"file": file} if file is not None else {}),
                **(
                    {"additional_formats": (None, json.dumps(jsonable_encoder(additional_formats)), "application/json")}
                    if additional_formats is not OMIT
                    else {}
                ),
            },
            request_options=request_options,
            omit=OMIT,
            force_multipart=True,
        )
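        # Map the response onto the SDK's typed results: a 2XX body is parsed into
        # SpeechToTextConvertResponse, a 422 raises UnprocessableEntityError carrying
        # the validation details, and any other status falls through to a generic
        # ApiError (with the raw text as the body if the payload is not valid JSON).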
        try:
            if 200 <= _response.status_code < 300:
                _data = typing.cast(
                    SpeechToTextConvertResponse,
                    construct_type(
                        type_=SpeechToTextConvertResponse,  # type: ignore
                        object_=_response.json(),
                    ),
                )
                return HttpResponse(response=_response, data=_data)
            if _response.status_code == 422:
                raise UnprocessableEntityError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        HttpValidationError,
                        construct_type(
                            type_=HttpValidationError,  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            _response_json = _response.json()
        except JSONDecodeError:
            raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text)
        raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json)


class AsyncRawSpeechToTextClient:
    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        self._client_wrapper = client_wrapper

    async def convert(
        self,
        *,
        model_id: str,
        enable_logging: typing.Optional[bool] = None,
        file: typing.Optional[core.File] = OMIT,
        language_code: typing.Optional[str] = OMIT,
        tag_audio_events: typing.Optional[bool] = OMIT,
        num_speakers: typing.Optional[int] = OMIT,
        timestamps_granularity: typing.Optional[SpeechToTextConvertRequestTimestampsGranularity] = OMIT,
        diarize: typing.Optional[bool] = OMIT,
        diarization_threshold: typing.Optional[float] = OMIT,
        additional_formats: typing.Optional[AdditionalFormats] = OMIT,
        file_format: typing.Optional[SpeechToTextConvertRequestFileFormat] = OMIT,
        cloud_storage_url: typing.Optional[str] = OMIT,
        webhook: typing.Optional[bool] = OMIT,
        webhook_id: typing.Optional[str] = OMIT,
        temperature: typing.Optional[float] = OMIT,
        seed: typing.Optional[int] = OMIT,
        use_multi_channel: typing.Optional[bool] = OMIT,
        webhook_metadata: typing.Optional[SpeechToTextConvertRequestWebhookMetadata] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> AsyncHttpResponse[SpeechToTextConvertResponse]:
        """
        Transcribe an audio or video file. If `webhook` is set to true, the request is processed asynchronously and the results are sent to configured webhooks. When `use_multi_channel` is true and the provided audio has multiple channels, a 'transcripts' object with a separate transcript for each channel is returned; otherwise, a single transcript is returned. The optional `webhook_metadata` parameter lets you attach custom data that will be included in webhook responses for request correlation and tracking.

        Parameters
        ----------
        model_id : str
            The ID of the model to use for transcription. Currently only 'scribe_v1' and 'scribe_v1_experimental' are available.

        enable_logging : typing.Optional[bool]
            When enable_logging is set to false, zero-retention mode is used for the request, meaning log and transcript storage features are unavailable for it. Zero-retention mode may only be used by enterprise customers.

        file : typing.Optional[core.File]
            See core.File for more documentation.

        language_code : typing.Optional[str]
            An ISO-639-1 or ISO-639-3 language_code corresponding to the language of the audio file. Can sometimes improve transcription performance if known beforehand. Defaults to null, in which case the language is predicted automatically.

        tag_audio_events : typing.Optional[bool]
            Whether to tag audio events like (laughter), (footsteps), etc. in the transcription.

        num_speakers : typing.Optional[int]
            The maximum number of speakers talking in the uploaded file. Can help with predicting who speaks when. At most 32 speakers can be predicted. Defaults to null, in which case the number of speakers is set to the maximum value the model supports.

        timestamps_granularity : typing.Optional[SpeechToTextConvertRequestTimestampsGranularity]
            The granularity of the timestamps in the transcription. 'word' provides word-level timestamps and 'character' provides character-level timestamps per word.

        diarize : typing.Optional[bool]
            Whether to annotate which speaker is currently talking in the uploaded file.

        diarization_threshold : typing.Optional[float]
            Diarization threshold to apply during speaker diarization. A higher value lowers the chance of one speaker being diarized as two different speakers, but raises the chance of two different speakers being diarized as one speaker (fewer total speakers predicted); a lower value does the opposite (more total speakers predicted). Can only be set when diarize=True and num_speakers=None. Defaults to None, in which case a threshold is chosen based on the model_id (usually 0.22).

        additional_formats : typing.Optional[AdditionalFormats]
            A list of additional formats to export the transcript to.

        file_format : typing.Optional[SpeechToTextConvertRequestFileFormat]
            The format of input audio. Options are 'pcm_s16le_16' or 'other'. For `pcm_s16le_16`, the input audio must be 16-bit PCM at a 16kHz sample rate, single channel (mono), and little-endian byte order. Latency will be lower than when passing an encoded waveform.

        cloud_storage_url : typing.Optional[str]
            The HTTPS URL of the file to transcribe. Exactly one of the file or cloud_storage_url parameters must be provided. The file must be accessible via HTTPS and the file size must be less than 2GB. Any valid HTTPS URL is accepted, including URLs from cloud storage providers (AWS S3, Google Cloud Storage, Cloudflare R2, etc.), CDNs, or any other HTTPS source. URLs can be pre-signed or include authentication tokens in query parameters.

        webhook : typing.Optional[bool]
            Whether to send the transcription result to configured speech-to-text webhooks. If set, the request returns early without the transcription, which is delivered later via webhook.

        webhook_id : typing.Optional[str]
            Optional specific webhook ID to send the transcription result to. Only valid when webhook is set to true. If not provided, transcription will be sent to all configured speech-to-text webhooks.

        temperature : typing.Optional[float]
            Controls the randomness of the transcription output. Accepts values between 0.0 and 2.0, where higher values result in more diverse and less deterministic results. If omitted, a temperature based on the selected model is used, which is usually 0.

        seed : typing.Optional[int]
            If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed. Must be an integer between 0 and 2147483647.

        use_multi_channel : typing.Optional[bool]
            Whether the audio file contains multiple channels where each channel contains a single speaker. When enabled, each channel will be transcribed independently and the results will be combined. Each word in the response will include a 'channel_index' field indicating which channel it was spoken on. A maximum of 5 channels is supported.

        webhook_metadata : typing.Optional[SpeechToTextConvertRequestWebhookMetadata]
            Optional metadata to be included in the webhook response. This should be a JSON string representing an object with a maximum depth of 2 levels and maximum size of 16KB. Useful for tracking internal IDs, job references, or other contextual information.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        AsyncHttpResponse[SpeechToTextConvertResponse]
            Synchronous transcription result
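
        Examples
        --------
        A minimal sketch of an awaited call. `raw_client` is assumed to be an
        already-initialized AsyncRawSpeechToTextClient (the client wrapper is
        normally constructed by the SDK's root async client), and `audio.mp3`
        is an assumed local file::

            async def main() -> None:
                with open("audio.mp3", "rb") as audio:
                    result = await raw_client.convert(model_id="scribe_v1", file=audio)
                print(result.data)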
        """
        _response = await self._client_wrapper.httpx_client.request(
            "v1/speech-to-text",
            method="POST",
            params={
                "enable_logging": enable_logging,
            },
            data={
                "model_id": model_id,
                "language_code": language_code,
                "tag_audio_events": tag_audio_events,
                "num_speakers": num_speakers,
                "timestamps_granularity": timestamps_granularity,
                "diarize": diarize,
                "diarization_threshold": diarization_threshold,
                "file_format": file_format,
                "cloud_storage_url": cloud_storage_url,
                "webhook": webhook,
                "webhook_id": webhook_id,
                "temperature": temperature,
                "seed": seed,
                "use_multi_channel": use_multi_channel,
                "webhook_metadata": webhook_metadata,
            },
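            # The raw upload and the JSON-encoded additional_formats spec travel as
            # multipart parts below; the fields in `data` above are sent as form
            # fields of the same multipart body (force_multipart=True).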
            files={
                **({"file": file} if file is not None else {}),
                **(
                    {"additional_formats": (None, json.dumps(jsonable_encoder(additional_formats)), "application/json")}
                    if additional_formats is not OMIT
                    else {}
                ),
            },
            request_options=request_options,
            omit=OMIT,
            force_multipart=True,
        )
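        # Map the response onto the SDK's typed results: a 2XX body is parsed into
        # SpeechToTextConvertResponse, a 422 raises UnprocessableEntityError carrying
        # the validation details, and any other status falls through to a generic
        # ApiError (with the raw text as the body if the payload is not valid JSON).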
        try:
            if 200 <= _response.status_code < 300:
                _data = typing.cast(
                    SpeechToTextConvertResponse,
                    construct_type(
                        type_=SpeechToTextConvertResponse,  # type: ignore
                        object_=_response.json(),
                    ),
                )
                return AsyncHttpResponse(response=_response, data=_data)
            if _response.status_code == 422:
                raise UnprocessableEntityError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        HttpValidationError,
                        construct_type(
                            type_=HttpValidationError,  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            _response_json = _response.json()
        except JSONDecodeError:
            raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text)
        raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json)
